From df7a9d1407cf189a86dc54c8de7b7de4936dd842 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Thu, 17 Mar 2022 00:43:28 +0300 Subject: [PATCH 01/63] release fix 2022-03-16 (#1375) --- .circleci/ansible/deploy.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index c95524a8a5..2dd109f99a 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -119,7 +119,7 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID tags: - pageserver @@ -169,6 +169,6 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID tags: - safekeeper From 73f247d537ebea4719461e0d0d3a8c5c92e45bb0 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 31 May 2022 16:00:50 +0400 Subject: [PATCH 02/63] Bump vendor/postgres to hotfix basebackup LSN comparison. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 038b2b98e5..658157375a 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 038b2b98e5c3d6274cbd43e9b822cdd946cb8b91 +Subproject commit 658157375a2b1b574766c1a055dde224c269a2f8 From cf350c6002e1107f8b800b5cc4e4f273ea432c5f Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 17:36:35 +0200 Subject: [PATCH 03/63] Use :local compute-tools tag to build compute-node image --- .circleci/config.yml | 24 ++++++++++++++---------- vendor/postgres | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5346e35c01..3377b907cb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -462,9 +462,6 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build neondatabase/compute-tools:latest image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | @@ -472,7 +469,10 @@ jobs: docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:latest -f Dockerfile.compute-tools . + --tag neondatabase/compute-tools:local \ + --tag neondatabase/compute-tools:latest \ + -f Dockerfile.compute-tools . 
+ # Only push :latest image docker push neondatabase/compute-tools:latest - run: name: Init postgres submodule @@ -482,7 +482,9 @@ jobs: command: | echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:latest vendor/postgres + docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ + --tag neondatabase/compute-node:latest vendor/postgres \ + --build-arg COMPUTE_TOOLS_TAG=local docker push neondatabase/compute-node:${DOCKER_TAG} docker push neondatabase/compute-node:latest @@ -519,9 +521,6 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build neondatabase/compute-tools:release image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | @@ -529,7 +528,10 @@ jobs: docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:release -f Dockerfile.compute-tools . + --tag neondatabase/compute-tools:release \ + --tag neondatabase/compute-tools:local \ + -f Dockerfile.compute-tools . + # Only push :release image docker push neondatabase/compute-tools:release - run: name: Init postgres submodule @@ -539,7 +541,9 @@ jobs: command: | echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:release vendor/postgres + docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ + --tag neondatabase/compute-node:release vendor/postgres \ + --build-arg COMPUTE_TOOLS_TAG=local docker push neondatabase/compute-node:${DOCKER_TAG} docker push neondatabase/compute-node:release diff --git a/vendor/postgres b/vendor/postgres index 658157375a..dba273190e 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 658157375a2b1b574766c1a055dde224c269a2f8 +Subproject commit dba273190e546c2a6345c38435e91780797c734f From cc856eca85eb6fb586537583b215b790d34cdb7d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 18:35:06 +0200 Subject: [PATCH 04/63] Install missing openssl packages in the Github Actions workflow --- .github/workflows/testing.yml | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 79b2ba05d0..ad7bddfabc 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -34,11 +34,11 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | sudo apt update - sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev + sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev - - name: Install macOs postgres dependencies + - name: Install macOS postgres dependencies if: matrix.os == 'macos-latest' - run: brew install flex bison + run: brew install flex bison openssl - name: Set pg revision for caching id: pg_ver @@ -52,10 +52,27 @@ jobs: tmp_install/ key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} + - name: Set extra env for macOS + if: matrix.os == 'macos-latest' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 
'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' run: make postgres + # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' + # and the real cause will be inside config.log + - name: Print configure logs in case of failure + if: failure() + continue-on-error: true + run: | + echo '' && echo '=== config.log ===' && echo '' + cat tmp_install/build/config.log + echo '' && echo '=== configure.log ===' && echo '' + cat tmp_install/build/configure.log + - name: Cache cargo deps id: cache_cargo uses: actions/cache@v2 From 93467eae1f1b6dd0bb66bc0a2869b3f6e3f6afe5 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Sat, 22 Oct 2022 02:26:28 +0300 Subject: [PATCH 05/63] Hotfix to disable grant create on public schema `GRANT CREATE ON SCHEMA public` fails if there is no schema `public`. Disable it in release for now and make a better fix later (it is needed for v15 support). --- compute_tools/src/spec.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index e0c0e9404b..1e7cd51b6e 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -423,11 +423,11 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { ); db_client.simple_query(&alter_query)?; - // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. - // This is needed since postgres 15, where this privilege is removed by default. - let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string(); - info!("grant query for db {} : {}", &db.name, &grant_query); - db_client.simple_query(&grant_query)?; + // // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. + // // This is needed since postgres 15, where this privilege is removed by default. 
+ // let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string(); + // info!("grant query for db {} : {}", &db.name, &grant_query); + // db_client.simple_query(&grant_query)?; } Ok(()) From 323c4ecb4fc67d3ca63de9800fcafa00dfde9a91 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 25 Oct 2022 16:41:50 +0200 Subject: [PATCH 06/63] Add data format backward compatibility tests (#2626) --- .../actions/run-python-test-set/action.yml | 18 ++ .github/workflows/build_and_test.yml | 23 +- poetry.lock | 52 +++- pyproject.toml | 2 + test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_compatibility.py | 267 ++++++++++++++++++ 6 files changed, 352 insertions(+), 12 deletions(-) create mode 100644 test_runner/regress/test_compatibility.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index cc6ab65b76..07cb7edbe7 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -73,6 +73,13 @@ runs: shell: bash -euxo pipefail {0} run: ./scripts/pysync + - name: Download compatibility snapshot for Postgres 14 + uses: ./.github/actions/download + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14 + path: /tmp/compatibility_snapshot_pg14 + prefix: latest + - name: Run pytest env: NEON_BIN: /tmp/neon/bin @@ -80,6 +87,8 @@ runs: BUILD_TYPE: ${{ inputs.build_type }} AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} + COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14 + ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes') shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report @@ -154,6 +163,15 @@ runs: scripts/generate_and_push_perf_report.sh fi + - name: Upload compatibility snapshot for Postgres 14 + if: github.ref_name == 'release' + uses: ./.github/actions/upload + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} + # The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test + path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/ + prefix: latest + - name: Create Allure report if: always() uses: ./.github/actions/allure-report diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 14ee61c5b9..660f93b025 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -844,7 +844,7 @@ jobs: submodules: true fetch-depth: 0 - - name: Configure environment + - name: Configure environment run: | helm repo add neondatabase https://neondatabase.github.io/helm-charts aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner @@ -853,3 +853,24 @@ jobs: run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + + promote-compatibility-test-snapshot: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ deploy, deploy-proxy ] + if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' + steps: + - 
name: Promote compatibility snapshot for the release + shell: bash -euxo pipefail {0} + env: + BUCKET: neon-github-public-dev + PREFIX: artifacts/latest + run: | + for build_type in debug release; do + OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst + NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst + + time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} + done diff --git a/poetry.lock b/poetry.lock index 27de8508ce..dfcb16107f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0" psycopg2-binary = ">=2.8.4" [package.extras] -sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"] +sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" @@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] -tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] +tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -560,7 +560,7 @@ optional = false python-versions = ">=3.6.0" [package.extras] -unicode_backport = ["unicodedata2"] +unicode-backport = ["unicodedata2"] [[package]] name = "click" @@ -593,7 +593,7 @@ python-versions = ">=3.6" cffi = ">=1.12" [package.extras] -docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx_rtd_theme"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] sdist = ["setuptools_rust (>=0.11.4)"] @@ -738,9 +738,9 @@ python-versions = ">=3.6.1,<4.0" [package.extras] colors = ["colorama (>=0.4.3,<0.5.0)"] -pipfile_deprecated_finder = ["pipreqs", "requirementslib"] +pipfile-deprecated-finder = ["pipreqs", "requirementslib"] plugins = ["setuptools"] -requirements_deprecated_finder = ["pip-api", "pipreqs"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] [[package]] name = "itsdangerous" @@ -815,7 +815,7 @@ python-versions = ">=2.7" [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -"testing.libs" = ["simplejson", "ujson", "yajl"] +testing-libs = ["simplejson", "ujson", "yajl"] [[package]] name = "jsonpointer" @@ -836,11 +836,12 @@ python-versions = "*" [package.dependencies] attrs = ">=17.4.0" pyrsistent = ">=0.14.0" +setuptools = "*" six = ">=1.11.0" [package.extras] format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator 
(>0.1.0)", "webcolors"] +format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] [[package]] name = "junit-xml" @@ -900,6 +901,7 @@ pytz = "*" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" responses = ">=0.9.0" +setuptools = {version = "*", optional = true, markers = "extra == \"server\""} sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0" xmltodict = "*" @@ -1008,6 +1010,7 @@ python-versions = ">=3.7.0,<4.0.0" jsonschema = ">=3.2.0,<5.0.0" openapi-schema-validator = ">=0.2.0,<0.3.0" PyYAML = ">=5.1" +setuptools = "*" [package.extras] requests = ["requests"] @@ -1340,7 +1343,7 @@ urllib3 = ">=1.21.1,<1.27" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" @@ -1394,6 +1397,19 @@ python-versions = ">= 2.7" attrs = "*" pbr = "*" +[[package]] +name = "setuptools" +version = "65.5.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + [[package]] name = "six" version = "1.16.0" @@ -1460,6 +1476,14 @@ category = "main" optional = false python-versions = ">=3.7,<4.0" +[[package]] +name = "types-toml" +version = "0.10.8" +description = "Typing stubs for toml" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "types-urllib3" version = "1.26.17" @@ -1544,7 +1568,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "ead1495454ee6d880bb240447025db93a25ebe263c2709de5f144cc2d85dc975" +content-hash = "17cdbfe90f1b06dffaf24c3e076384ec08dd4a2dce5a05e50565f7364932eb2d" [metadata.files] aiopg = [ @@ -2182,6 +2206,10 @@ sarif-om = [ {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, ] +setuptools = [ + {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"}, + {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"}, +] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = 
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -2210,6 +2238,10 @@ types-s3transfer = [ {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"}, {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"}, ] +types-toml = [ + {file = "types-toml-0.10.8.tar.gz", hash = "sha256:b7e7ea572308b1030dc86c3ba825c5210814c2825612ec679eb7814f8dd9295a"}, + {file = "types_toml-0.10.8-py3-none-any.whl", hash = "sha256:8300fd093e5829eb9c1fba69cee38130347d4b74ddf32d0a7df650ae55c2b599"}, +] types-urllib3 = [ {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"}, {file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"}, diff --git a/pyproject.toml b/pyproject.toml index 1ee6fbe6f4..765e0b97eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,12 +28,14 @@ Werkzeug = "2.1.2" pytest-order = "^1.0.1" allure-pytest = "^2.10.0" pytest-asyncio = "^0.19.0" +toml = "^0.10.2" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" mypy = "==0.971" black = "^22.6.0" isort = "^5.10.1" +types-toml = "^0.10.8" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4b2638bb2a..38a0db7cf7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -970,7 +970,7 @@ class NeonPageserverApiException(Exception): class NeonPageserverHttpClient(requests.Session): - def __init__(self, port: int, is_testing_enabled_or_skip, auth_token: Optional[str] = None): + def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): super().__init__() self.port = port self.auth_token = auth_token diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py new file mode 100644 index 0000000000..944ff64390 --- /dev/null +++ b/test_runner/regress/test_compatibility.py @@ -0,0 +1,267 @@ +import os +import re +import shutil +import subprocess +from pathlib import Path +from typing import Any, Dict, Union + +import pytest +import toml +from fixtures.neon_fixtures import ( + NeonCli, + NeonEnvBuilder, + NeonPageserverHttpClient, + PgBin, + PortDistributor, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.types import Lsn +from pytest import FixtureRequest + + +def dump_differs(first: Path, second: Path, output: Path) -> bool: + """ + Runs diff(1) command on two SQL dumps and write the output to the given output file. + Returns True if the dumps differ, False otherwise. + """ + + with output.open("w") as stdout: + rv = subprocess.run( + [ + "diff", + "--unified", # Make diff output more readable + "--ignore-matching-lines=^--", # Ignore changes in comments + "--ignore-blank-lines", + str(first), + str(second), + ], + stdout=stdout, + ) + + return rv.returncode != 0 + + +class PortReplacer(object): + """ + Class-helper for replacing ports in config files. 
+ """ + + def __init__(self, port_distributor: PortDistributor): + self.port_distributor = port_distributor + self.port_map: Dict[int, int] = {} + + def replace_port(self, value: Union[int, str]) -> Union[int, str]: + if isinstance(value, int): + if (known_port := self.port_map.get(value)) is not None: + return known_port + + self.port_map[value] = self.port_distributor.get_port() + return self.port_map[value] + + if isinstance(value, str): + # Use regex to find port in a string + # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432" + # See https://bugs.python.org/issue27657 + ports = re.findall(r":(\d+)(?:/|$)", value) + assert len(ports) == 1, f"can't find port in {value}" + port_int = int(ports[0]) + + if (known_port := self.port_map.get(port_int)) is not None: + return value.replace(f":{port_int}", f":{known_port}") + + self.port_map[port_int] = self.port_distributor.get_port() + return value.replace(f":{port_int}", f":{self.port_map[port_int]}") + + raise TypeError(f"unsupported type {type(value)} of {value=}") + + +def test_backward_compatibility( + pg_bin: PgBin, port_distributor: PortDistributor, test_output_dir: Path, request: FixtureRequest +): + compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") + assert ( + compatibility_snapshot_dir_env is not None + ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_prepare_snapshot" + compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() + + # Make compatibility snapshot artifacts pickupable by Allure + # by copying the snapshot directory to the curent test output directory. + repo_dir = test_output_dir / "compatibility_snapshot" / "repo" + + shutil.copytree(compatibility_snapshot_dir / "repo", repo_dir) + + # Remove old logs to avoid confusion in test artifacts + for logfile in repo_dir.glob("**/*.log"): + logfile.unlink() + + # Remove tenants data for computes + for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"): + shutil.rmtree(tenant) + + # Remove wal-redo temp directory + for tenant in (repo_dir / "tenants").glob("*"): + shutil.rmtree(tenant / "wal-redo-datadir.___temp") + + # Update paths and ports in config files + pr = PortReplacer(port_distributor) + + pageserver_toml = repo_dir / "pageserver.toml" + pageserver_config = toml.load(pageserver_toml) + new_local_path = pageserver_config["remote_storage"]["local_path"].replace( + "/test_prepare_snapshot/", + "/test_backward_compatibility/compatibility_snapshot/", + ) + + pageserver_config["remote_storage"]["local_path"] = new_local_path + pageserver_config["listen_http_addr"] = pr.replace_port(pageserver_config["listen_http_addr"]) + pageserver_config["listen_pg_addr"] = pr.replace_port(pageserver_config["listen_pg_addr"]) + pageserver_config["broker_endpoints"] = [ + pr.replace_port(ep) for ep in pageserver_config["broker_endpoints"] + ] + + with pageserver_toml.open("w") as f: + toml.dump(pageserver_config, f) + + snapshot_config_toml = repo_dir / "config" + snapshot_config = toml.load(snapshot_config_toml) + snapshot_config["etcd_broker"]["broker_endpoints"] = [ + pr.replace_port(ep) for ep in snapshot_config["etcd_broker"]["broker_endpoints"] + ] + snapshot_config["pageserver"]["listen_http_addr"] = pr.replace_port( + snapshot_config["pageserver"]["listen_http_addr"] + ) + snapshot_config["pageserver"]["listen_pg_addr"] = pr.replace_port( + snapshot_config["pageserver"]["listen_pg_addr"] + ) + for sk in 
snapshot_config["safekeepers"]: + sk["http_port"] = pr.replace_port(sk["http_port"]) + sk["pg_port"] = pr.replace_port(sk["pg_port"]) + + with (snapshot_config_toml).open("w") as f: + toml.dump(snapshot_config, f) + + # Ensure that snapshot doesn't contain references to the original path + rv = subprocess.run( + [ + "grep", + "--recursive", + "--binary-file=without-match", + "--files-with-matches", + "test_prepare_snapshot/repo", + str(repo_dir), + ], + capture_output=True, + text=True, + ) + assert ( + rv.returncode != 0 + ), f"there're files referencing `test_prepare_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" + + # NeonEnv stub to make NeonCli happy + config: Any = type("NeonEnvStub", (object,), {}) + config.rust_log_override = None + config.repo_dir = repo_dir + config.pg_version = "14" # Note: `pg_dumpall` (from pg_bin) version is set by DEFAULT_PG_VERSION_DEFAULT and can be overriden by DEFAULT_PG_VERSION env var + config.initial_tenant = snapshot_config["default_tenant_id"] + + # Check that we can start the project + cli = NeonCli(config) + try: + cli.raw_cli(["start"]) + request.addfinalizer(lambda: cli.raw_cli(["stop"])) + + result = cli.pg_start("main") + request.addfinalizer(lambda: cli.pg_stop("main")) + except Exception: + breaking_changes_allowed = ( + os.environ.get("ALLOW_BREAKING_CHANGES", "false").lower() == "true" + ) + if breaking_changes_allowed: + pytest.xfail("Breaking changes are allowed by ALLOW_BREAKING_CHANGES env var") + else: + raise + + connstr_all = re.findall(r"Starting postgres node at '([^']+)'", result.stdout) + assert len(connstr_all) == 1, f"can't parse connstr from {result.stdout}" + connstr = connstr_all[0] + + # Check that the project produces the same dump as the previous version. 
+ # The assert itself deferred to the end of the test + # to allow us to perform checks that change data before failing + pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]) + initial_dump_differs = dump_differs( + compatibility_snapshot_dir / "dump.sql", + test_output_dir / "dump.sql", + test_output_dir / "dump.filediff", + ) + + # Check that project can be recovered from WAL + # loosely based on https://github.com/neondatabase/cloud/wiki/Recovery-from-WAL + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1] + auth_token = snapshot_config["pageserver"]["auth_token"] + pageserver_http = NeonPageserverHttpClient( + port=pageserver_port, + is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled + auth_token=auth_token, + ) + + shutil.rmtree(repo_dir / "local_fs_remote_storage") + pageserver_http.timeline_delete(tenant_id, timeline_id) + pageserver_http.timeline_create(tenant_id, timeline_id) + pg_bin.run( + ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] + ) + # The assert itself deferred to the end of the test + # to allow us to perform checks that change data before failing + dump_from_wal_differs = dump_differs( + test_output_dir / "dump.sql", + test_output_dir / "dump-from-wal.sql", + test_output_dir / "dump-from-wal.filediff", + ) + + # Check that we can interract with the data + pg_bin.run(["pgbench", "--time=10", "--progress=2", connstr]) + + assert not dump_from_wal_differs, "dump from WAL differs" + assert not initial_dump_differs, "initial dump differs" + + +@pytest.mark.order(after="test_backward_compatibility") +# Note: if renaming this test, don't forget to update a reference to it in a workflow file: +# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml +def test_prepare_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): + # The test doesn't really test anything + # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. + # + # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. 
+ neon_env_builder.pg_version = "14" + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_local_fs_remote_storage() + + env = neon_env_builder.init_start() + pg = env.postgres.create_start("main") + pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()]) + pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()]) + pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]) + + snapshot_config = toml.load(test_output_dir / "repo" / "config") + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + + pageserver_http = env.pageserver.http_client() + lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) + wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + + env.postgres.stop_all() + for sk in env.safekeepers: + sk.stop() + env.pageserver.stop() + + shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14") + # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it From 7a491f52c451172dd62729b93b35359145ee4661 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 25 Oct 2022 11:25:22 -0400 Subject: [PATCH 07/63] Add draw_timeline binary (#2688) --- Cargo.lock | 7 ++ Dockerfile | 3 +- pageserver/Cargo.toml | 1 + pageserver/src/bin/draw_timeline_dir.rs | 150 ++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 pageserver/src/bin/draw_timeline_dir.rs diff --git a/Cargo.lock b/Cargo.lock index 13774f7fe6..b39ca6e5a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2170,6 +2170,7 @@ dependencies = [ "serde_json", "serde_with", "signal-hook", + "svg_fmt", "tar", "tempfile", "thiserror", @@ -3461,6 +3462,12 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +[[package]] +name = "svg_fmt" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" + [[package]] name = "symbolic-common" version = "8.8.0" diff --git a/Dockerfile b/Dockerfile index cb4e213687..b0d934d480 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
RUN set -e \ -&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \ +&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \ && cachepot -s # Build final image @@ -65,6 +65,7 @@ RUN set -e \ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 2139e24ee2..b075b86aa1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -67,6 +67,7 @@ remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } close_fds = "0.3.2" walkdir = "2.3.2" +svg_fmt = "0.4.1" [dev-dependencies] criterion = "0.4" diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/src/bin/draw_timeline_dir.rs new file mode 100644 index 0000000000..ea1ff7f3c7 --- /dev/null +++ b/pageserver/src/bin/draw_timeline_dir.rs @@ -0,0 +1,150 @@ +//! A tool for visualizing the arrangement of layerfiles within a timeline. +//! +//! It reads filenames from stdin and prints a svg on stdout. The image is a plot in +//! page-lsn space, where every delta layer is a rectangle and every image layer is a +//! thick line. Legend: +//! - The x axis (left to right) represents page index. +//! - The y axis represents LSN, growing upwards. +//! +//! Coordinates in both axis are compressed for better readability. +//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb) +//! +//! Example use: +//! ``` +//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE +//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg +//! $ firefox out.svg +//! ``` +//! +//! This API was chosen so that we can easily work with filenames extracted from ssh, +//! or from pageserver log files. +//! +//! TODO Consider shipping this as a grafana panel plugin: +//! https://grafana.com/tutorials/build-a-panel-plugin/ +use anyhow::Result; +use pageserver::repository::Key; +use std::cmp::Ordering; +use std::io::{self, BufRead}; +use std::{ + collections::{BTreeMap, BTreeSet}, + ops::Range, +}; +use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke}; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); + +// Map values to their compressed coordinate - the index the value +// would have in a sorted and deduplicated list of all values. 
+fn build_coordinate_compression_map(coords: Vec) -> BTreeMap { + let set: BTreeSet = coords.into_iter().collect(); + + let mut map: BTreeMap = BTreeMap::new(); + for (i, e) in set.iter().enumerate() { + map.insert(*e, i); + } + + map +} + +fn parse_filename(name: &str) -> (Range, Range) { + let split: Vec<&str> = name.split("__").collect(); + let keys: Vec<&str> = split[0].split('-').collect(); + let mut lsns: Vec<&str> = split[1].split('-').collect(); + if lsns.len() == 1 { + lsns.push(lsns[0]); + } + + let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); + let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap(); + (keys, lsns) +} + +fn main() -> Result<()> { + // Parse layer filenames from stdin + let mut ranges: Vec<(Range, Range)> = vec![]; + let stdin = io::stdin(); + for line in stdin.lock().lines() { + let range = parse_filename(&line.unwrap()); + ranges.push(range); + } + + // Collect all coordinates + let mut keys: Vec = vec![]; + let mut lsns: Vec = vec![]; + for (keyr, lsnr) in &ranges { + keys.push(keyr.start); + keys.push(keyr.end); + lsns.push(lsnr.start); + lsns.push(lsnr.end); + } + + // Analyze + let key_map = build_coordinate_compression_map(keys); + let lsn_map = build_coordinate_compression_map(lsns); + + // Initialize stats + let mut num_deltas = 0; + let mut num_images = 0; + + // Draw + let stretch = 3.0; // Stretch out vertically for better visibility + println!( + "{}", + BeginSvg { + w: key_map.len() as f32, + h: stretch * lsn_map.len() as f32 + } + ); + for (keyr, lsnr) in &ranges { + let key_start = *key_map.get(&keyr.start).unwrap(); + let key_end = *key_map.get(&keyr.end).unwrap(); + let key_diff = key_end - key_start; + let lsn_max = lsn_map.len(); + + if key_start >= key_end { + panic!("Invalid key range {}-{}", key_start, key_end); + } + + let lsn_start = *lsn_map.get(&lsnr.start).unwrap(); + let lsn_end = *lsn_map.get(&lsnr.end).unwrap(); + + let mut lsn_diff = (lsn_end - lsn_start) as f32; + let mut fill = Fill::None; + let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas + let mut lsn_offset = 0.0; + + // Fill in and thicken rectangle if it's an + // image layer so that we can see it. 
+ match lsn_start.cmp(&lsn_end) { + Ordering::Less => num_deltas += 1, + Ordering::Equal => { + num_images += 1; + lsn_diff = 0.3; + lsn_offset = -lsn_diff / 2.0; + margin = 0.05; + fill = Fill::Color(rgb(0, 0, 0)); + } + Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + } + + println!( + " {}", + rectangle( + key_start as f32 + stretch * margin, + stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)), + key_diff as f32 - stretch * 2.0 * margin, + stretch * (lsn_diff - 2.0 * margin) + ) + .fill(fill) + .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) + .border_radius(0.4) + ); + } + println!("{}", EndSvg); + + eprintln!("num_images: {}", num_images); + eprintln!("num_deltas: {}", num_deltas); + + Ok(()) +} From 70c3d18bb0c72992de1438c0f77a4e2b9c72fcab Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 26 Oct 2022 02:51:23 +0300 Subject: [PATCH 08/63] Do not release to new staging proxies on release (#2685) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 660f93b025..1b8b380179 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -832,7 +832,7 @@ jobs: # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + (github.ref_name == 'main') && github.event_name != 'workflow_dispatch' defaults: run: From ede70d833cc547c00dd6496eee4483a98e39a04a Mon Sep 17 00:00:00 2001 From: mikecaat <35882227+mikecaat@users.noreply.github.com> Date: Wed, 26 Oct 2022 19:59:25 +0900 Subject: [PATCH 09/63] Add a docker-compose example file (#1943) (#2666) Co-authored-by: Masahiro Ikeda --- docker-compose/compute/shell/compute.sh | 48 +++++ .../compute/var/db/postgres/specs/spec.json | 141 ++++++++++++ docker-compose/docker-compose.yml | 200 ++++++++++++++++++ docker-compose/image/compute/Dockerfile | 10 + docs/docker.md | 64 ++++++ scripts/docker-compose_test.sh | 51 +++++ 6 files changed, 514 insertions(+) create mode 100755 docker-compose/compute/shell/compute.sh create mode 100644 docker-compose/compute/var/db/postgres/specs/spec.json create mode 100644 docker-compose/docker-compose.yml create mode 100644 docker-compose/image/compute/Dockerfile create mode 100755 scripts/docker-compose_test.sh diff --git a/docker-compose/compute/shell/compute.sh b/docker-compose/compute/shell/compute.sh new file mode 100755 index 0000000000..cef2b485f3 --- /dev/null +++ b/docker-compose/compute/shell/compute.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -eux + +PG_VERSION=${PG_VERSION:-14} + +SPEC_FILE_ORG=/var/db/postgres/specs/spec.json +SPEC_FILE=/tmp/spec.json + +echo "Waiting pageserver become ready." +while ! nc -z pageserver 6400; do + sleep 1; +done +echo "Page server is ready." + +echo "Create a tenant and timeline" +PARAMS=( + -sb + -X POST + -H "Content-Type: application/json" + -d "{}" + http://pageserver:9898/v1/tenant/ +) +tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g') + +PARAMS=( + -sb + -X POST + -H "Content-Type: application/json" + -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" +) +result=$(curl "${PARAMS[@]}") +echo $result | jq . 
+ +echo "Overwrite tenant id and timeline id in spec file" +tenant_id=$(echo ${result} | jq -r .tenant_id) +timeline_id=$(echo ${result} | jq -r .timeline_id) + +sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} +sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} + +cat ${SPEC_FILE} + +echo "Start compute node" +/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ + -C "postgresql://cloud_admin@localhost:55433/postgres" \ + -b /usr/local/bin/postgres \ + -S ${SPEC_FILE} diff --git a/docker-compose/compute/var/db/postgres/specs/spec.json b/docker-compose/compute/var/db/postgres/specs/spec.json new file mode 100644 index 0000000000..10ae0b0ecf --- /dev/null +++ b/docker-compose/compute/var/db/postgres/specs/spec.json @@ -0,0 +1,141 @@ +{ + "format_version": 1.0, + + "timestamp": "2022-10-12T18:00:00.000Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + + "cluster": { + "cluster_id": "docker_compose", + "name": "docker_compose_test", + "state": "restarted", + "roles": [ + { + "name": "cloud_admin", + "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", + "options": null + } + ], + "databases": [ + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "replica", + "vartype": "enum" + }, + { + "name": "hot_standby", + "value": "on", + "vartype": "bool" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "port", + "value": "55433", + "vartype": "integer" + }, + { + "name": "shared_buffers", + "value": "1MB", + "vartype": "string" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "wal_sender_timeout", + "value": "5s", + "vartype": "string" + }, + { + "name": "wal_keep_size", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "restart_after_crash", + "value": "off", + "vartype": "bool" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon", + "vartype": "string" + }, + { + "name": "neon.safekeepers", + "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", + "vartype": "string" + }, + { + "name": "neon.timeline_id", + "value": "TIMELINE_ID", + "vartype": "string" + }, + { + "name": "neon.tenant_id", + "value": "TENANT_ID", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": "host=pageserver port=6400", + "vartype": "string" + }, + { + "name": "max_replication_write_lag", + "value": "500MB", + "vartype": "string" + }, + { + "name": "max_replication_flush_lag", + "value": "10GB", + "vartype": "string" + } + ] + }, + + "delta_operations": [ + ] +} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml new file mode 100644 index 0000000000..9ab775c3f9 --- /dev/null +++ b/docker-compose/docker-compose.yml @@ -0,0 +1,200 @@ +version: '3' + +services: + etcd: + image: quay.io/coreos/etcd:v3.5.4 + ports: + - 2379:2379 + - 2380:2380 + environment: + # This signifficantly speeds up etcd and we anyway don't data persistency there. 
+ ETCD_UNSAFE_NO_FSYNC: "1" + command: + - "etcd" + - "--auto-compaction-mode=revision" + - "--auto-compaction-retention=1" + - "--name=etcd-cluster" + - "--initial-cluster-state=new" + - "--initial-cluster-token=etcd-cluster-1" + - "--initial-cluster=etcd-cluster=http://etcd:2380" + - "--initial-advertise-peer-urls=http://etcd:2380" + - "--advertise-client-urls=http://etcd:2379" + - "--listen-client-urls=http://0.0.0.0:2379" + - "--listen-peer-urls=http://0.0.0.0:2380" + - "--quota-backend-bytes=134217728" # 128 MB + + minio: + image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z + ports: + - 9000:9000 + - 9001:9001 + environment: + - MINIO_ROOT_USER=minio + - MINIO_ROOT_PASSWORD=password + command: server /data --address :9000 --console-address ":9001" + + minio_create_buckets: + image: minio/mc + environment: + - MINIO_ROOT_USER=minio + - MINIO_ROOT_PASSWORD=password + entrypoint: + - "/bin/sh" + - "-c" + command: + - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do + echo 'Waiting to start minio...' && sleep 1; + done; + /usr/bin/mc mb minio/neon --region=eu-north-1; + exit 0;" + depends_on: + - minio + + pageserver: + image: neondatabase/neon:${TAG:-latest} + environment: + - BROKER_ENDPOINT='http://etcd:2379' + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 6400:6400 # pg protocol handler + - 9898:9898 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "/usr/local/bin/pageserver -D /data/.neon/ + -c \"broker_endpoints=[$$BROKER_ENDPOINT]\" + -c \"listen_pg_addr='0.0.0.0:6400'\" + -c \"listen_http_addr='0.0.0.0:9898'\" + -c \"remote_storage={endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/pageserver/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper1: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 + - SAFEKEEPER_ID=1 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7676:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper2: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 + - SAFEKEEPER_ID=2 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7677:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper3: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 + - SAFEKEEPER_ID=3 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - 
AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7678:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + compute: + build: + context: ./image/compute + args: + - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest} + - http_proxy=$http_proxy + - https_proxy=$https_proxy + environment: + - PG_VERSION=${PG_VERSION:-14} + #- RUST_BACKTRACE=1 + volumes: + - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/ + - ./compute/shell/:/shell/ + ports: + - 55433:55433 # pg protocol handler + - 3080:3080 # http endpoints + entrypoint: + - "/shell/compute.sh" + depends_on: + - safekeeper1 + - safekeeper2 + - safekeeper3 + - pageserver + + compute_is_ready: + image: postgres:latest + entrypoint: + - "/bin/bash" + - "-c" + command: + - "until pg_isready -h compute -p 55433 ; do + echo 'Waiting to start compute...' && sleep 1; + done" + depends_on: + - compute diff --git a/docker-compose/image/compute/Dockerfile b/docker-compose/image/compute/Dockerfile new file mode 100644 index 0000000000..1b9d8c4900 --- /dev/null +++ b/docker-compose/image/compute/Dockerfile @@ -0,0 +1,10 @@ +ARG COMPUTE_IMAGE=compute-node-v14:latest +FROM neondatabase/${COMPUTE_IMAGE} + +USER root +RUN apt-get update && \ + apt-get install -y curl \ + jq \ + netcat + +USER postgres diff --git a/docs/docker.md b/docs/docker.md index 100cdd248b..42f0048e6f 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -18,3 +18,67 @@ We build all images after a successful `release` tests run and push automaticall 1. `neondatabase/compute-tools` and `neondatabase/compute-node` 2. `neondatabase/neon` + +## Docker Compose example + +You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. + +- etcd x 1 +- pageserver x 1 +- safekeeper x 3 +- compute x 1 +- MinIO x 1 # This is Amazon S3 compatible object storage + +### How to use + +1. create containers + +You can specify version of neon cluster using following environment values. +- PG_VERSION: postgres version for compute (default is 14) +- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) +``` +$ cd docker-compose/docker-compose.yml +$ docker-compose down # remove the conainers if exists +$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version +Creating network "dockercompose_default" with the default driver +Creating dockercompose_etcd3_1 ... +(...omit...) +``` + +2. connect compute node +``` +$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass +$ psql -h localhost -p 55433 -U cloud_admin +postgres=# CREATE TABLE t(key int primary key, value text); +CREATE TABLE +postgres=# insert into t values(1,1); +INSERT 0 1 +postgres=# select * from t; + key | value +-----+------- + 1 | 1 +(1 row) +``` + +3. If you want to see the log, you can use `docker-compose logs` command. 
+``` +# check the container name you want to see +$ docker ps +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1 +(...omit...) + +$ docker logs -f dockercompose_compute_1 +2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql +2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400' +(...omit...) +``` + +4. If you want to see durable data in MinIO which is s3 compatible storage + +Access http://localhost:9001 and sign in. + +- Username: `minio` +- Password: `password` + +You can see durable pages and WAL data in `neon` bucket. \ No newline at end of file diff --git a/scripts/docker-compose_test.sh b/scripts/docker-compose_test.sh new file mode 100755 index 0000000000..b4551365f8 --- /dev/null +++ b/scripts/docker-compose_test.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# this is a shortcut script to avoid duplication in CI +set -eux -o pipefail + +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +COMPOSE_FILE=$SCRIPT_DIR/../docker-compose/docker-compose.yml + +COMPUTE_CONTAINER_NAME=dockercompose_compute_1 +SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" +PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" + +cleanup() { + echo "show container information" + docker ps + docker-compose -f $COMPOSE_FILE logs + echo "stop containers..." + docker-compose -f $COMPOSE_FILE down +} + +echo "clean up containers if exists" +cleanup + +for pg_version in 14 15; do + echo "start containers (pg_version=$pg_version)." + PG_VERSION=$pg_version TAG=latest docker-compose -f $COMPOSE_FILE up --build -d + + echo "wait until the compute is ready. timeout after 60s. " + cnt=0 + while sleep 1; do + # check timeout + cnt=`expr $cnt + 1` + if [ $cnt -gt 60 ]; then + echo "timeout before the compute is ready." + cleanup + exit 1 + fi + + # check if the compute is ready + set +o pipefail + result=`docker-compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l` + set -o pipefail + if [ $result -eq 1 ]; then + echo "OK. The compute is ready to connect." + echo "execute simple queries." 
+ docker exec -it $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" + cleanup + break + fi + done +done From b4c55f5d2445452329ee09527da1c3080503e23a Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 26 Oct 2022 17:32:31 -0400 Subject: [PATCH 10/63] Move pagestream api to libs/pageserver_api (#2698) --- Cargo.lock | 3 + libs/pageserver_api/Cargo.toml | 3 + libs/pageserver_api/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 161 +++++++++++++++++ .../pageserver_api}/src/reltag.rs | 0 pageserver/src/basebackup.rs | 2 +- pageserver/src/import_datadir.rs | 2 +- pageserver/src/lib.rs | 1 - pageserver/src/page_service.rs | 166 +----------------- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/walingest.rs | 2 +- pageserver/src/walredo.rs | 2 +- 13 files changed, 181 insertions(+), 166 deletions(-) rename {pageserver => libs/pageserver_api}/src/reltag.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index b39ca6e5a7..3e67126add 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2189,7 +2189,10 @@ dependencies = [ name = "pageserver_api" version = "0.1.0" dependencies = [ + "anyhow", + "bytes", "const_format", + "postgres_ffi", "serde", "serde_with", "utils", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 5995325a2f..9121cd4989 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -7,6 +7,9 @@ edition = "2021" serde = { version = "1.0", features = ["derive"] } serde_with = "2.0" const_format = "0.2.21" +anyhow = { version = "1.0", features = ["backtrace"] } +bytes = "1.0.1" utils = { path = "../utils" } +postgres_ffi = { path = "../postgres_ffi" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index a36c1692a9..4890d54f36 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -2,6 +2,7 @@ use const_format::formatcp; /// Public API types pub mod models; +pub mod reltag; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dd40ba9e1c..4360f76fd1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,6 +7,10 @@ use utils::{ lsn::Lsn, }; +use crate::reltag::RelTag; +use anyhow::bail; +use bytes::{Buf, BufMut, Bytes, BytesMut}; + /// A state of a tenant in pageserver's memory. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TenantState { @@ -219,3 +223,160 @@ pub struct FailpointConfig { pub struct TimelineGcRequest { pub gc_horizon: Option, } + +// Wrapped in libpq CopyData +pub enum PagestreamFeMessage { + Exists(PagestreamExistsRequest), + Nblocks(PagestreamNblocksRequest), + GetPage(PagestreamGetPageRequest), + DbSize(PagestreamDbSizeRequest), +} + +// Wrapped in libpq CopyData +pub enum PagestreamBeMessage { + Exists(PagestreamExistsResponse), + Nblocks(PagestreamNblocksResponse), + GetPage(PagestreamGetPageResponse), + Error(PagestreamErrorResponse), + DbSize(PagestreamDbSizeResponse), +} + +#[derive(Debug)] +pub struct PagestreamExistsRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, +} + +#[derive(Debug)] +pub struct PagestreamNblocksRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, +} + +#[derive(Debug)] +pub struct PagestreamGetPageRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, + pub blkno: u32, +} + +#[derive(Debug)] +pub struct PagestreamDbSizeRequest { + pub latest: bool, + pub lsn: Lsn, + pub dbnode: u32, +} + +#[derive(Debug)] +pub struct PagestreamExistsResponse { + pub exists: bool, +} + +#[derive(Debug)] +pub struct PagestreamNblocksResponse { + pub n_blocks: u32, +} + +#[derive(Debug)] +pub struct PagestreamGetPageResponse { + pub page: Bytes, +} + +#[derive(Debug)] +pub struct PagestreamErrorResponse { + pub message: String, +} + +#[derive(Debug)] +pub struct PagestreamDbSizeResponse { + pub db_size: i64, +} + +impl PagestreamFeMessage { + pub fn parse(mut body: Bytes) -> anyhow::Result { + // TODO these gets can fail + + // these correspond to the NeonMessageTag enum in pagestore_client.h + // + // TODO: consider using protobuf or serde bincode for less error prone + // serialization. 
+ let msg_tag = body.get_u8(); + match msg_tag { + 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + })), + 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + })), + 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + blkno: body.get_u32(), + })), + 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + dbnode: body.get_u32(), + })), + _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), + } + } +} + +impl PagestreamBeMessage { + pub fn serialize(&self) -> Bytes { + let mut bytes = BytesMut::new(); + + match self { + Self::Exists(resp) => { + bytes.put_u8(100); /* tag from pagestore_client.h */ + bytes.put_u8(resp.exists as u8); + } + + Self::Nblocks(resp) => { + bytes.put_u8(101); /* tag from pagestore_client.h */ + bytes.put_u32(resp.n_blocks); + } + + Self::GetPage(resp) => { + bytes.put_u8(102); /* tag from pagestore_client.h */ + bytes.put(&resp.page[..]); + } + + Self::Error(resp) => { + bytes.put_u8(103); /* tag from pagestore_client.h */ + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_i64(resp.db_size); + } + } + + bytes.into() + } +} diff --git a/pageserver/src/reltag.rs b/libs/pageserver_api/src/reltag.rs similarity index 100% rename from pageserver/src/reltag.rs rename to libs/pageserver_api/src/reltag.rs diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index d0a57a473b..973c3cd3a6 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,8 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ee3dc684e3..642e41765b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -12,10 +12,10 @@ use tracing::*; use walkdir::WalkDir; use crate::pgdatadir_mapping::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::WalStreamDecoder; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c75f940386..52a4cb0381 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -8,7 +8,6 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod profiling; -pub mod reltag; pub mod repository; pub mod storage_sync; pub mod task_mgr; diff --git 
a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d61885314e..aec91bc7f1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -10,8 +10,14 @@ // use anyhow::{bail, ensure, Context, Result}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; +use bytes::Bytes; use futures::{Stream, StreamExt}; +use pageserver_api::models::{ + PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, + PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, + PagestreamNblocksRequest, PagestreamNblocksResponse, +}; use std::io; use std::net::TcpListener; use std::str; @@ -35,7 +41,6 @@ use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::profiling::profpoint_start; -use crate::reltag::RelTag; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; @@ -45,163 +50,6 @@ use crate::CheckpointConfig; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -// Wrapped in libpq CopyData -enum PagestreamFeMessage { - Exists(PagestreamExistsRequest), - Nblocks(PagestreamNblocksRequest), - GetPage(PagestreamGetPageRequest), - DbSize(PagestreamDbSizeRequest), -} - -// Wrapped in libpq CopyData -enum PagestreamBeMessage { - Exists(PagestreamExistsResponse), - Nblocks(PagestreamNblocksResponse), - GetPage(PagestreamGetPageResponse), - Error(PagestreamErrorResponse), - DbSize(PagestreamDbSizeResponse), -} - -#[derive(Debug)] -struct PagestreamExistsRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, -} - -#[derive(Debug)] -struct PagestreamNblocksRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, -} - -#[derive(Debug)] -struct PagestreamGetPageRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, - blkno: u32, -} - -#[derive(Debug)] -struct PagestreamDbSizeRequest { - latest: bool, - lsn: Lsn, - dbnode: u32, -} - -#[derive(Debug)] -struct PagestreamExistsResponse { - exists: bool, -} - -#[derive(Debug)] -struct PagestreamNblocksResponse { - n_blocks: u32, -} - -#[derive(Debug)] -struct PagestreamGetPageResponse { - page: Bytes, -} - -#[derive(Debug)] -struct PagestreamErrorResponse { - message: String, -} - -#[derive(Debug)] -struct PagestreamDbSizeResponse { - db_size: i64, -} - -impl PagestreamFeMessage { - fn parse(mut body: Bytes) -> anyhow::Result { - // TODO these gets can fail - - // these correspond to the NeonMessageTag enum in pagestore_client.h - // - // TODO: consider using protobuf or serde bincode for less error prone - // serialization. 
- let msg_tag = body.get_u8(); - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - blkno: body.get_u32(), - })), - 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - dbnode: body.get_u32(), - })), - _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), - } - } -} - -impl PagestreamBeMessage { - fn serialize(&self) -> Bytes { - let mut bytes = BytesMut::new(); - - match self { - Self::Exists(resp) => { - bytes.put_u8(100); /* tag from pagestore_client.h */ - bytes.put_u8(resp.exists as u8); - } - - Self::Nblocks(resp) => { - bytes.put_u8(101); /* tag from pagestore_client.h */ - bytes.put_u32(resp.n_blocks); - } - - Self::GetPage(resp) => { - bytes.put_u8(102); /* tag from pagestore_client.h */ - bytes.put(&resp.page[..]); - } - - Self::Error(resp) => { - bytes.put_u8(103); /* tag from pagestore_client.h */ - bytes.put(resp.message.as_bytes()); - bytes.put_u8(0); // null terminator - } - Self::DbSize(resp) => { - bytes.put_u8(104); /* tag from pagestore_client.h */ - bytes.put_i64(resp.db_size); - } - } - - bytes.into() - } -} - fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { async_stream::try_stream! { loop { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ca931ed37d..0e334a63df 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,12 +7,12 @@ //! Clarify that) //! 
use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, TimestampTz, TransactionId}; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 194ca0d857..6a96254df4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -37,8 +37,8 @@ use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; -use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; +use pageserver_api::reltag::RelTag; use postgres_ffi::to_pg_timestamp; use utils::{ diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9a6b99d991..8c81ed824b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -31,10 +31,10 @@ use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use crate::pgdatadir_mapping::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; use crate::ZERO_PAGE; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e683c301d8..1cde11082e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -43,10 +43,10 @@ use crate::metrics::{ WAL_REDO_WAIT_TIME, }; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; -use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ From ab0be7b8da124839dbaf5fc85978d292e6ef427c Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 27 Oct 2022 03:50:46 +0300 Subject: [PATCH 11/63] Avoid debian-testing packages in compute Dockerfiles plv8 can only be built with a fairly new gold linker version. We used to install it via binutils packages from testing, but it also updates libc and that causes troubles in the resulting image as different extensions were built against different libc versions. We could either use libc from debian-testing everywhere or restrain from using testing packages and install necessary programs manually. This patch uses the latter approach: gold for plv8 and cmake for h3 are installed manually. In a passing declare h3_postgis as a safe extension (previous omission). 
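
To make the libc mismatch concrete, here is a small, hypothetical shell check (not part of this patch; it assumes objdump from binutils is available in the image and that extensions land under /usr/local/pgsql/lib, as in these Dockerfiles) that reports the highest versioned GLIBC symbol each built extension library requires:

    # Sketch only: confirm no extension needs a newer glibc than the
    # bullseye base image (2.31) provides.
    for so in /usr/local/pgsql/lib/*.so; do
        # objdump -T dumps the dynamic symbol table, including GLIBC_x.y version tags.
        max=$(objdump -T "$so" 2>/dev/null | grep -o 'GLIBC_[0-9.]*' | sort -uV | tail -n1)
        echo "${so}: ${max:-no versioned glibc symbols}"
    done

Before this patch, a plv8.so linked with testing binutils would report GLIBC_2.34 while the runtime image only ships 2.31; with gold and cmake installed manually, everything should stay within 2.31.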
--- Dockerfile.compute-node-v14 | 87 ++++++++++++++++++++++--------------- Dockerfile.compute-node-v15 | 74 ++++++++++++++++++------------- 2 files changed, 95 insertions(+), 66 deletions(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index 6d2b285fa3..035dfc0d08 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -1,24 +1,26 @@ -ARG TAG=pinned -# apparently, ARGs don't get replaced in RUN commands in kaniko -# ARG POSTGIS_VERSION=3.3.0 -# ARG PLV8_VERSION=3.1.4 -# ARG PG_VERSION=v14 +# +# This file is identical to the Dockerfile.compute-node-v15 file +# except for the version of Postgres that is built. +# +ARG TAG=pinned + +######################################################################################### # # Layer "build-deps" # +######################################################################################### FROM debian:bullseye-slim AS build-deps -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config +######################################################################################### # # Layer "pg-build" # Build Postgres from the neon postgres repository. # +######################################################################################### FROM build-deps AS pg-build COPY vendor/postgres-v14 postgres RUN cd postgres && \ @@ -29,22 +31,20 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install +######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # -# PostGIS compiles against neon postgres sources without changes. Perhaps we -# could even use the upstream binaries, compiled against vanilla Postgres, but -# it would require some investigation to check that it works, and also keeps -# working in the future. So for now, we compile our own binaries. 
+######################################################################################### FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ - tar xvzf postgis-3.3.0.tar.gz && \ - cd postgis-3.3.0 && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ + tar xvzf postgis-3.3.1.tar.gz && \ + cd postgis-3.3.1 && \ ./autogen.sh && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure && \ @@ -57,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control +######################################################################################### # # Layer "plv8-build" # Build plv8 # +######################################################################################### FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils -# https://github.com/plv8/plv8/issues/475 -# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN apt update && \ - apt install -y --no-install-recommends -t testing binutils +# https://github.com/plv8/plv8/issues/475: +# v8 uses gold for linking and sets `--thread-count=4` which breaks +# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) +# Install newer gold version manually as debian-testing binutils version updates +# libc version, which in turn breaks other extension built against non-testing libc. 
+RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ + tar xvzf binutils-2.38.tar.gz && \ + cd binutils-2.38 && \ + cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ + cd ../bfd && ./configure && make bfdver.h && \ + cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ + cp /usr/local/bin/ld.gold /usr/bin/gold # Sed is used to patch for https://github.com/plv8/plv8/issues/503 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -77,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +######################################################################################### # # Layer "h3-pg-build" # Build h3_pg # +######################################################################################### FROM build-deps AS h3-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # packaged cmake is too old -RUN apt update && \ - apt install -y --no-install-recommends -t testing cmake +RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ + -q -O /tmp/cmake-install.sh \ + && chmod u+x /tmp/cmake-install.sh \ + && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ + && rm /tmp/cmake-install.sh RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ tar xvzf h3.tgz && \ @@ -110,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### # # Layer "neon-pg-ext-build" # compile neon extensions # +######################################################################################### FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -128,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ -C pgxn/neon \ -s install +######################################################################################### +# # Compile and run the Neon-specific `compute_ctl` binary +# +######################################################################################### FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +######################################################################################### # # Clean up postgres folder before inclusion # +######################################################################################### FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql @@ -155,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### # # Final layer # Put it all together into the final image # +######################################################################################### FROM debian:bullseye-slim # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ @@ -175,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libreadline8 for psql # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -# GLIBC 2.34 for plv8. -# Debian bullseye provides GLIBC 2.31, so we install the library from testing # # Lastly, link compute_ctl into zenith_ctl while we're at it, # so that we don't need to put this in another layer. @@ -189,12 +212,6 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - echo "Installing GLIBC 2.34" && \ - echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update && \ - apt install -y --no-install-recommends -t testing libc6 && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index b7b1f25103..0b6e570b44 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -4,26 +4,23 @@ # ARG TAG=pinned -# apparently, ARGs don't get replaced in RUN commands in kaniko -# ARG POSTGIS_VERSION=3.3.1 -# ARG PLV8_VERSION=3.1.4 -# ARG PG_VERSION=v15 +######################################################################################### # # Layer "build-deps" # +######################################################################################### FROM debian:bullseye-slim AS build-deps -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config +######################################################################################### # # Layer "pg-build" # Build Postgres from the neon postgres repository. 
# +######################################################################################### FROM build-deps AS pg-build COPY vendor/postgres-v15 postgres RUN cd postgres && \ @@ -34,14 +31,12 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install +######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # -# PostGIS compiles against neon postgres sources without changes. Perhaps we -# could even use the upstream binaries, compiled against vanilla Postgres, but -# it would require some investigation to check that it works, and also keeps -# working in the future. So for now, we compile our own binaries. +######################################################################################### FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -62,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control +######################################################################################### # # Layer "plv8-build" # Build plv8 # +######################################################################################### FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils -# https://github.com/plv8/plv8/issues/475 -# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN apt update && \ - apt install -y --no-install-recommends -t testing binutils +# https://github.com/plv8/plv8/issues/475: +# v8 uses gold for linking and sets `--thread-count=4` which breaks +# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) +# Install newer gold version manually as debian-testing binutils version updates +# libc version, which in turn breaks other extension built against non-testing libc. 
+RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ + tar xvzf binutils-2.38.tar.gz && \ + cd binutils-2.38 && \ + cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ + cd ../bfd && ./configure && make bfdver.h && \ + cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ + cp /usr/local/bin/ld.gold /usr/bin/gold # Sed is used to patch for https://github.com/plv8/plv8/issues/503 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -82,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +######################################################################################### # # Layer "h3-pg-build" # Build h3_pg # +######################################################################################### FROM build-deps AS h3-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # packaged cmake is too old -RUN apt update && \ - apt install -y --no-install-recommends -t testing cmake +RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ + -q -O /tmp/cmake-install.sh \ + && chmod u+x /tmp/cmake-install.sh \ + && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ + && rm /tmp/cmake-install.sh RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ tar xvzf h3.tgz && \ @@ -115,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### # # Layer "neon-pg-ext-build" # compile neon extensions # +######################################################################################### FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -133,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ -C pgxn/neon \ -s install +######################################################################################### +# # Compile and run the Neon-specific `compute_ctl` binary +# +######################################################################################### FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +######################################################################################### # # Clean up postgres folder before inclusion # +######################################################################################### FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql @@ -160,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### # # Final layer # Put it all together into the final image # +######################################################################################### FROM debian:bullseye-slim # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ @@ -180,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libreadline8 for psql # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -# GLIBC 2.34 for plv8. -# Debian bullseye provides GLIBC 2.31, so we install the library from testing # # Lastly, link compute_ctl into zenith_ctl while we're at it, # so that we don't need to put this in another layer. @@ -194,12 +212,6 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - echo "Installing GLIBC 2.34" && \ - echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update && \ - apt install -y --no-install-recommends -t testing libc6 && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres From 8e5bb3ed4989bbf86ff8df16b145e27f28e8e9e7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 27 Oct 2022 11:09:09 +0400 Subject: [PATCH 12/63] Enable etcd compaction in neon_local. --- control_plane/src/etcd.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index ccadfa8ce7..ca2df8a50b 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -52,6 +52,10 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { // size smaller. Our test etcd clusters are very small. // See https://github.com/etcd-io/etcd/issues/7910 "--quota-backend-bytes=100000000".to_string(), + // etcd doesn't compact (vacuum) with default settings, + // enable it to prevent space exhaustion. + "--auto-compaction-mode=revision".to_string(), + "--auto-compaction-retention=1".to_string(), ]) .stdout(Stdio::from(etcd_stdout_file)) .stderr(Stdio::from(etcd_stderr_file)) From 0816168296883b086dbc50af0b4ebc9fbdf3afe7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 15 Dec 2022 12:36:57 +0400 Subject: [PATCH 13/63] Hotfix: terminate subscription if channel is full. Might help as a hotfix, but need to understand root better. 
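
For context, "channel is full" here surfaces as the tokio broadcast channel lagging: a slow subscriber stops keeping up, older messages are overwritten, and its receiver starts returning RecvError::Lagged. A minimal, self-contained sketch of the new behaviour (illustrative only; it assumes tokio with the rt/macros/sync features and uses a plain break where the broker actually returns a gRPC error through tonic):

    use tokio::sync::broadcast::{self, error::RecvError};

    #[tokio::main]
    async fn main() {
        // Tiny capacity so the consumer falls behind immediately.
        let (tx, mut rx) = broadcast::channel::<u64>(4);

        // Fast producer overruns the 4-slot ring buffer before anything is read.
        for i in 0..16u64 {
            tx.send(i).expect("receiver still alive");
        }

        loop {
            match rx.recv().await {
                Ok(msg) => println!("delivered {msg}"),
                // The receiver lagged: `skipped` messages were overwritten and
                // are gone for good. Instead of only logging (the old behaviour),
                // terminate the subscription so the subscriber notices and
                // re-subscribes rather than silently missing messages.
                Err(RecvError::Lagged(skipped)) => {
                    eprintln!("dropped {skipped} messages, channel is full; closing subscription");
                    break;
                }
                Err(RecvError::Closed) => break,
            }
        }
    }

Terminating the stream trades an explicit reconnect for a silent gap, which is the safer failure mode until the root cause of the lag is understood.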
--- storage_broker/src/bin/storage_broker.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index fdf2637b4d..1a743394ad 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -369,8 +369,9 @@ impl BrokerService for Broker { Err(RecvError::Lagged(skipped_msg)) => { missed_msgs += skipped_msg; if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) { - warn!("dropped {} messages, channel is full", missed_msgs); - missed_msgs = 0; + error!("subscription id={}, key={:?}, addr={:?} dropped {} messages, channel is full", + subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); + Err(Status::new(Code::Internal, "full channel"))?; } } Err(RecvError::Closed) => { From d24de169a7ac1de84f08a420295e309a5946f893 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 16 Dec 2022 00:57:59 +0400 Subject: [PATCH 14/63] Deploy broker with L4 LB in new env. Seems to be fixing issue with missing keepalives. --- .../ansible/prod.ap-southeast-1.hosts.yaml | 2 +- .github/ansible/prod.eu-central-1.hosts.yaml | 2 +- .github/ansible/prod.us-east-2.hosts.yaml | 2 +- .github/ansible/prod.us-west-2.hosts.yaml | 2 +- .github/ansible/staging.eu-west-1.hosts.yaml | 2 +- .github/ansible/staging.us-east-2.hosts.yaml | 2 +- ...ev-eu-west-1-zeta.neon-storage-broker.yaml | 33 ++++++++----------- ...ev-us-east-2-beta.neon-storage-broker.yaml | 33 ++++++++----------- ...utheast-1-epsilon.neon-storage-broker.yaml | 33 ++++++++----------- ...u-central-1-gamma.neon-storage-broker.yaml | 33 ++++++++----------- ...d-us-east-2-delta.neon-storage-broker.yaml | 33 ++++++++----------- ...rod-us-west-2-eta.neon-storage-broker.yaml | 33 ++++++++----------- .github/workflows/build_and_test.yml | 4 +-- 13 files changed, 92 insertions(+), 122 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index bcc7bb3b16..648029c120 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-ap-southeast-1 bucket_region: ap-southeast-1 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 2b372d0fcb..c285a9f3b6 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-eu-central-1 bucket_region: eu-central-1 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.gamma.eu-central-1.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 7a4002ec88..1753068b8c 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-east-2 bucket_region: us-east-2 
console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.delta.us-east-2.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 682ee5994d..7d6e49bf9c 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-west-2 bucket_region: us-west-2 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.eta.us-west-2.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index 90f00175b0..cfcc3a9ae8 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-dev-storage-eu-west-1 bucket_region: eu-west-1 console_mgmt_base_url: http://console-staging.local - broker_endpoint: https://storage-broker.zeta.eu-west-1.internal.aws.neon.build:443 + broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index d2b7fae12a..78a4582e57 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-staging-storage-us-east-2 bucket_region: us-east-2 console_mgmt_base_url: http://console-staging.local - broker_endpoint: https://storage-broker.beta.us-east-2.internal.aws.neon.build:443 + broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index e876367a18..c6e682f571 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: staging neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.zeta.eu-west-1.internal.aws.neon.build - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.zeta.eu-west-1.internal.aws.neon.build - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + 
# assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index dcf4b99de2..c7682d24c0 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: staging neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.beta.us-east-2.internal.aws.neon.build - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.beta.us-east-2.internal.aws.neon.build - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.beta.us-east-2.internal.aws.neon.build + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml index 0abc6ebaa1..92b1777d0b 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: 
storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml index d44a3eab5c..f89df4533a 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.gamma.eu-central-1.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.gamma.eu-central-1.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml index b9eeff5681..8cbc1af7cf 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.delta.us-east-2.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.delta.us-east-2.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.delta.us-east-2.internal.aws.neon.tech + # service.type -- Service type + type: 
LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml index 249f76303a..8a7488948d 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.eta.us-west-2.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.eta.us-west-2.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.eta.us-west-2.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7a887cbece..43b855a2b0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1072,7 +1072,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s deploy-proxy-prod-new: runs-on: prod @@ -1149,7 +1149,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ 
secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s promote-compatibility-data: runs-on: [ self-hosted, dev, x64 ] From 73ea0a0b0188cfa40d09fb458e91f3cb38ce7425 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 16 Dec 2022 13:40:01 +0200 Subject: [PATCH 15/63] fix(remote_storage): use cached credentials (#3128) IMDSv2 has limits, and if we query it on every s3 interaction we are going to go over those limits. Changes the s3_bucket client configuration to use: - ChainCredentialsProvider to handle env variables or imds usage - LazyCachingCredentialsProvider to actually cache any credentials Related: https://github.com/awslabs/aws-sdk-rust/issues/629 Possibly related: https://github.com/neondatabase/neon/issues/3118 --- libs/remote_storage/src/s3_bucket.rs | 47 +++++++++++----------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ab1e5da6c5..740f3753d8 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,14 +4,13 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. -use std::env::var; use std::sync::Arc; -use std::time::Duration; use anyhow::Context; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, imds, - imds::credentials::ImdsCredentialsProvider, meta::credentials::provide_credentials_fn, + environment::credentials::EnvironmentVariableCredentialsProvider, + imds::credentials::ImdsCredentialsProvider, + meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider}, }; use aws_sdk_s3::{ config::Config, @@ -20,7 +19,6 @@ use aws_sdk_s3::{ Client, Endpoint, Region, }; use aws_smithy_http::body::SdkBody; -use aws_types::credentials::{CredentialsError, ProvideCredentials}; use hyper::Body; use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; @@ -31,8 +29,6 @@ use crate::{ Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR, }; -const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10); - pub(super) mod metrics { use metrics::{register_int_counter_vec, IntCounterVec}; use once_cell::sync::Lazy; @@ -122,30 +118,23 @@ impl S3Bucket { "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name ); + + let credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + let env_creds = EnvironmentVariableCredentialsProvider::new(); + // uses imds v2 + let imds = ImdsCredentialsProvider::builder().build(); + + // finally add caching. 
+ // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629 + LazyCachingCredentialsProvider::builder() + .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds)) + .build() + }; + let mut config_builder = Config::builder() .region(Region::new(aws_config.bucket_region.clone())) - .credentials_provider(provide_credentials_fn(|| async { - match var("AWS_ACCESS_KEY_ID").is_ok() && var("AWS_SECRET_ACCESS_KEY").is_ok() { - true => { - EnvironmentVariableCredentialsProvider::new() - .provide_credentials() - .await - } - false => { - let imds_client = imds::Client::builder() - .connect_timeout(DEFAULT_IMDS_TIMEOUT) - .read_timeout(DEFAULT_IMDS_TIMEOUT) - .build() - .await - .map_err(CredentialsError::unhandled)?; - ImdsCredentialsProvider::builder() - .imds_client(imds_client) - .build() - .provide_credentials() - .await - } - } - })); + .credentials_provider(credentials_provider); if let Some(custom_endpoint) = aws_config.endpoint.clone() { let endpoint = Endpoint::immutable( From ece05556002b9c95fb9ad8f0a475ab10468e77fd Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 14 Dec 2022 21:28:14 +0100 Subject: [PATCH 16/63] Push proxy metrics to Victoria Metrics (#3106) --- .../dev-eu-west-1-zeta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../dev-us-east-2-beta.neon-proxy-link.yaml | 25 +++++++++++++++++++ ...s-east-2-beta.neon-proxy-scram-legacy.yaml | 25 +++++++++++++++++++ .../dev-us-east-2-beta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../helm-values/neon-stress.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/neon-stress.proxy.yaml | 25 +++++++++++++++++++ ...-southeast-1-epsilon.neon-proxy-scram.yaml | 25 +++++++++++++++++++ ...d-eu-central-1-gamma.neon-proxy-scram.yaml | 25 +++++++++++++++++++ ...prod-us-east-2-delta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../prod-us-west-2-eta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../helm-values/production.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/production.proxy.yaml | 25 +++++++++++++++++++ .github/helm-values/staging.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/staging.proxy.yaml | 25 +++++++++++++++++++ 14 files changed, 350 insertions(+) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index f89eea5972..ae9c1f2e40 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index eeb025277b..093fac146a 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -38,3 +38,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index ed710bc196..a2f932e4fb 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index ba0109c1eb..1138536e94 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml index dea47304a0..ed580349fc 100644 --- a/.github/helm-values/neon-stress.proxy-scram.yaml +++ b/.github/helm-values/neon-stress.proxy-scram.yaml @@ -25,3 +25,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml index c3ecf6c743..94270ced09 100644 --- a/.github/helm-values/neon-stress.proxy.yaml +++ b/.github/helm-values/neon-stress.proxy.yaml @@ -34,3 +34,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index a37a37406c..4e4aff1f9e 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index 69d00a7e9c..94290a87e1 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 19d91fa4dc..1a4023708b 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index f148188c48..2942d6a2aa 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml index 399bc6d21b..c7143cd61a 100644 --- a/.github/helm-values/production.proxy-scram.yaml +++ b/.github/helm-values/production.proxy-scram.yaml @@ -23,3 +23,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml index 9db68c1044..dbaf3cd096 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/production.proxy.yaml @@ -32,3 +32,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml index f249df3612..66f9921c9a 100644 --- a/.github/helm-values/staging.proxy-scram.yaml +++ b/.github/helm-values/staging.proxy-scram.yaml @@ -30,3 +30,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml index 62b4c4a595..a22082e625 100644 --- a/.github/helm-values/staging.proxy.yaml +++ b/.github/helm-values/staging.proxy.yaml @@ -30,3 +30,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" From f759b561f3f7489b536c8ed4638e34ea5c73c91a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 29 Dec 2022 16:08:21 +0200 Subject: [PATCH 17/63] add pageserver to new region see https://github.com/neondatabase/aws/pull/116 --- .github/ansible/prod.us-west-2.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 7d6e49bf9c..9eb422a3ae 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -25,6 +25,8 @@ storage: ansible_host: i-0d9f6dfae0e1c780d pageserver-1.us-west-2.aws.neon.tech: ansible_host: i-0c834be1dddba8b3f + pageserver-2.us-west-2.aws.neon.tech: + ansible_host: i-051642d372c0a4f32 safekeepers: hosts: From 06d25f2186cc4b525dc138a8d193da92001f0e96 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 28 Dec 2022 17:48:49 +0200 Subject: [PATCH 18/63] switch to debug from info to produce less noise --- pageserver/src/storage_sync2.rs | 6 +++--- pageserver/src/tenant/timeline.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 7cc0eac2bf..55dbeaff73 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -203,7 +203,7 @@ use std::sync::{Arc, Mutex}; use anyhow::ensure; use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio::runtime::Runtime; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; @@ -747,7 +747,7 @@ impl RemoteTimelineClient { // We can launch this task. Remove it from the queue first. let next_op = upload_queue.queued_operations.pop_front().unwrap(); - info!("starting op: {}", next_op); + debug!("starting op: {}", next_op); // Update the counters match next_op { @@ -930,7 +930,7 @@ impl RemoteTimelineClient { task.op, retries ); } else { - info!("remote task {} completed successfully", task.op); + debug!("remote task {} completed successfully", task.op); } // The task has completed succesfully. Remove it from the in-progress list. 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a746fd9bf8..cd045d1081 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2288,7 +2288,7 @@ impl Timeline { // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. if let Some(remote_client) = &self.remote_client { - info!("waiting for upload ops to complete"); + debug!("waiting for upload ops to complete"); remote_client .wait_completion() .await @@ -2499,7 +2499,7 @@ impl Timeline { // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. if let Some(remote_client) = &self.remote_client { - info!("waiting for upload ops to complete"); + debug!("waiting for upload ops to complete"); remote_client .wait_completion() .await From cd01bbc715e9ac594af29f6a657430e7a0703877 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 22 Dec 2022 19:21:53 +0400 Subject: [PATCH 19/63] Move zenith-1-sk-3 to zenith-1-sk-4 (#3164) --- .github/ansible/production.hosts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index d22c845966..3122a43801 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -34,5 +34,5 @@ storage: console_region_id: aws-us-west-2 zenith-1-sk-2: console_region_id: aws-us-west-2 - zenith-1-sk-3: + zenith-1-sk-4: console_region_id: aws-us-west-2 From d90c5a03af3763949442686e118581e5cdd4dd90 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 17 Jan 2023 22:07:38 +0200 Subject: [PATCH 20/63] Add more io::Error context when fail to operate on a path (#3254) I have a test failure that shows ``` Caused by: 0: Failed to reconstruct a page image: 1: Directory not empty (os error 39) ``` but does not really show where exactly that happens. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3227/release/3823785365/index.html#categories/c0057473fc9ec8fb70876fd29a171ce8/7088dab272f2c7b7/?attachment=60fe6ed2add4d82d The PR aims to add more context in debugging that issue. --- pageserver/src/walredo.rs | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index a552c05d63..fd0524016f 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -626,24 +626,20 @@ impl PostgresRedoProcess { // Create empty data directory for wal-redo postgres, deleting old one first. 
if datadir.exists() { - info!( - "old temporary datadir {} exists, removing", - datadir.display() - ); - fs::remove_dir_all(&datadir)?; + info!("old temporary datadir {datadir:?} exists, removing"); + fs::remove_dir_all(&datadir).map_err(|e| { + Error::new( + e.kind(), + format!("Old temporary dir {datadir:?} removal failure: {e}"), + ) + })?; } - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).map_err(|e| { - Error::new( - ErrorKind::Other, - format!("incorrect pg_bin_dir path: {}", e), - ) - })?; - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).map_err(|e| { - Error::new( - ErrorKind::Other, - format!("incorrect pg_lib_dir path: {}", e), - ) - })?; + let pg_bin_dir_path = conf + .pg_bin_dir(pg_version) + .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_bin_dir path: {e}")))?; + let pg_lib_dir_path = conf + .pg_lib_dir(pg_version) + .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?; info!("running initdb in {}", datadir.display()); let initdb = Command::new(pg_bin_dir_path.join("initdb")) From bd535b3371f4e45f13a3f9abbacc3efbf931616f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Jan 2023 02:29:05 +0200 Subject: [PATCH 21/63] If an error happens while checking for core dumps, don't panic. If we panic, we skip the 30s wait in 'main', and don't give the console a chance to observe the error. Which is not nice. Spotted by @ololobus at https://github.com/neondatabase/neon/pull/3352#discussion_r1072806981 --- compute_tools/src/compute.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c2c9ab2230..d652084e00 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -23,7 +23,7 @@ use std::sync::RwLock; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use log::{info, warn}; +use log::{error, info, warn}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; @@ -311,8 +311,9 @@ impl ComputeNode { .wait() .expect("failed to start waiting on Postgres process"); - self.check_for_core_dumps() - .expect("failed to check for core dumps"); + if let Err(err) = self.check_for_core_dumps() { + error!("error while checking for core dumps: {err:?}"); + } Ok(ecode) } From 4992160677509996064df4f3dacc79c64fe2c9c2 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 18 Jan 2023 14:58:55 +0200 Subject: [PATCH 22/63] Fix metric_collection_endpoint for prod. 
It was incorrectly set to staging url --- .github/ansible/production.hosts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index 22bace5ade..ecb847bd61 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -7,7 +7,7 @@ storage: broker_endpoint: http://storage-broker.prod.local:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" From c85374295fd38e081c1a683281016355471852bd Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 18 Jan 2023 14:49:59 +0100 Subject: [PATCH 23/63] Change SENTRY_ENVIRONMENT from "development" to "staging" --- .github/ansible/staging.eu-west-1.hosts.yaml | 2 +- .github/ansible/staging.us-east-2.hosts.yaml | 2 +- .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml | 2 +- .github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml | 2 +- .github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml | 2 +- .../helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml | 2 +- .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml | 2 +- .github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index fce450ed39..f28dc8e07b 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -18,7 +18,7 @@ storage: ansible_aws_ssm_region: eu-west-1 ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1 console_region_id: aws-eu-west-1 - sentry_environment: development + sentry_environment: staging children: pageservers: diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 1d1b8dbfa4..4891875369 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -18,7 +18,7 @@ storage: ansible_aws_ssm_region: us-east-2 ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2 console_region_id: aws-us-east-2 - sentry_environment: development + sentry_environment: staging children: pageservers: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index 47924456ba..c49b8d2009 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -8,7 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.eu-west-1.aws.neon.build" - sentryEnvironment: "development" + sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index c6e682f571..ccf701f52d 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -49,4 +49,4 @@ extraManifests: - "{{ .Release.Namespace }}" settings: - sentryEnvironment: 
"development" + sentryEnvironment: "staging" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index eb8fd50c0f..cb062f705d 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -8,7 +8,7 @@ settings: authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" - sentryEnvironment: "development" + sentryEnvironment: "staging" metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 8a08738d5f..99b67d75c1 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -8,7 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.cloud.stage.neon.tech" - sentryEnvironment: "development" + sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index b02d46917c..764bb25b64 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -8,7 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.us-east-2.aws.neon.build" - sentryEnvironment: "development" + sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index c7682d24c0..69363c5f13 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -49,4 +49,4 @@ extraManifests: - "{{ .Release.Namespace }}" settings: - sentryEnvironment: "development" + sentryEnvironment: "staging" From cb356f325978241e20a82703b04b784cdf8bb605 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 7 Jan 2023 00:23:35 +0200 Subject: [PATCH 24/63] Use actual temporary dir for pageserver unit tests --- .gitignore | 2 - control_plane/.gitignore | 1 - pageserver/src/config.rs | 5 - pageserver/src/tenant.rs | 105 ++++++++---------- pageserver/src/tenant/ephemeral_file.rs | 36 +++--- .../src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/virtual_file.rs | 8 +- pageserver/src/walingest.rs | 12 +- .../src/walreceiver/connection_manager.rs | 16 +-- test_runner/sql_regress/.gitignore | 1 - 10 files changed, 80 insertions(+), 108 deletions(-) delete mode 100644 control_plane/.gitignore diff --git a/.gitignore b/.gitignore index f1afdee599..2e241ee8cd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,5 @@ /pg_install /target -/tmp_check -/tmp_check_cli __pycache__/ test_output/ .vscode diff --git a/control_plane/.gitignore b/control_plane/.gitignore deleted file mode 100644 index c1e54a6bcb..0000000000 --- a/control_plane/.gitignore 
+++ /dev/null @@ -1 +0,0 @@ -tmp_check/ diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 51d1664e52..f3e5fb8c1a 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -693,11 +693,6 @@ impl PageServerConf { Ok(t_conf) } - #[cfg(test)] - pub fn test_repo_dir(test_name: &str) -> PathBuf { - PathBuf::from(format!("../tmp_check/test_{test_name}")) - } - pub fn dummy_conf(repo_dir: PathBuf) -> Self { let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1d0d6b66ab..0dd6735993 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2626,9 +2626,9 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; - use once_cell::sync::Lazy; - use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; + use std::sync::Arc; use std::{fs, path::PathBuf}; + use tempfile::TempDir; use utils::lsn::Lsn; use crate::{ @@ -2659,8 +2659,6 @@ pub mod harness { buf.freeze() } - static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); - impl From for TenantConfOpt { fn from(tenant_conf: TenantConf) -> Self { Self { @@ -2681,36 +2679,27 @@ pub mod harness { } } - pub struct TenantHarness<'a> { + /// The harness saves some boilerplate and provides a way to create functional tenant + /// without running pageserver binary. It uses temporary directory to store data in it. + /// Tempdir gets removed on harness drop. + pub struct TenantHarness { + // keep the struct to not to remove tmp dir during the test + _temp_repo_dir: TempDir, pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: TenantId, - - pub lock_guard: ( - Option>, - Option>, - ), } - impl<'a> TenantHarness<'a> { - pub fn create(test_name: &'static str) -> anyhow::Result { - Self::create_internal(test_name, false) - } - pub fn create_exclusive(test_name: &'static str) -> anyhow::Result { - Self::create_internal(test_name, true) - } - fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result { - let lock_guard = if exclusive { - (None, Some(LOCK.write().unwrap())) - } else { - (Some(LOCK.read().unwrap()), None) - }; + static LOG_HANDLE: OnceCell<()> = OnceCell::new(); - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - fs::create_dir_all(&repo_dir)?; + impl TenantHarness { + pub fn new() -> anyhow::Result { + let temp_repo_dir = tempfile::tempdir()?; + // `TempDir` uses a randomly generated subdirectory of a system tmp dir, + // so far it's enough to take care of concurrently running tests. + let repo_dir = temp_repo_dir.path(); - let conf = PageServerConf::dummy_conf(repo_dir); + let conf = PageServerConf::dummy_conf(repo_dir.to_path_buf()); // Make a static copy of the config. This can never be free'd, but that's // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); @@ -2728,10 +2717,10 @@ pub mod harness { fs::create_dir_all(conf.timelines_path(&tenant_id))?; Ok(Self { + _temp_repo_dir: temp_repo_dir, conf, tenant_conf, tenant_id, - lock_guard, }) } @@ -2825,7 +2814,8 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_basic")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; @@ -2858,9 +2848,8 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let tenant = TenantHarness::create("no_duplicate_timelines")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let _ = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2891,7 +2880,8 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_branch")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2988,10 +2978,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let tenant = - TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3026,9 +3014,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; tenant .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? @@ -3077,9 +3064,8 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3101,9 +3087,8 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; @@ -3134,8 +3119,7 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { - const TEST_NAME: &str = "timeline_load"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::new()?; { let tenant = harness.load().await; let tline = tenant @@ -3154,8 +3138,7 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { - const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::new()?; // create two timelines { let tenant = harness.load().await; @@ -3193,8 +3176,7 @@ mod tests { #[tokio::test] async fn corrupt_metadata() -> anyhow::Result<()> { - const TEST_NAME: &str = "corrupt_metadata"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::new()?; let tenant = harness.load().await; tenant @@ -3235,7 +3217,8 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_images")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3302,7 +3285,8 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_bulk_insert")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3346,7 +3330,8 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_random_updates")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3419,9 +3404,8 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_branches")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3505,9 +3489,8 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_ancestors")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index c433e65ad2..0debeaff1c 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -76,7 +76,7 @@ impl EphemeralFile { }) } - fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> { + fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> io::Result<()> { let mut off = 0; while off < PAGE_SZ { let n = self @@ -277,7 +277,7 @@ impl Drop for EphemeralFile { } } -pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> { +pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> io::Result<()> { if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) { match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) { Ok(_) => Ok(()), @@ -332,25 +332,17 @@ mod tests { use super::*; use crate::tenant::blob_io::{BlobCursor, BlobWriter}; use crate::tenant::block_io::BlockCursor; + use crate::tenant::harness::TenantHarness; use rand::{seq::SliceRandom, thread_rng, RngCore}; use std::fs; use std::str::FromStr; - fn harness( - test_name: &str, - ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - let conf = PageServerConf::dummy_conf(repo_dir); - // Make a static copy of the config. This can never be free'd, but that's - // OK in a test. - let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - - let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + fn harness() -> Result<(TenantHarness, TimelineId), io::Error> { + let harness = TenantHarness::new().expect("Failed to create tenant harness"); let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; + fs::create_dir_all(harness.timeline_path(&timeline_id))?; - Ok((conf, tenant_id, timeline_id)) + Ok((harness, timeline_id)) } // Helper function to slurp contents of a file, starting at the current position, @@ -367,10 +359,10 @@ mod tests { } #[test] - fn test_ephemeral_files() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?; + fn test_ephemeral_files() -> io::Result<()> { + let (harness, timeline_id) = harness()?; - let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let file_a = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; file_a.write_all_at(b"foo", 0)?; assert_eq!("foo", read_string(&file_a, 0, 20)?); @@ -381,7 +373,7 @@ mod tests { // Open a lot of files, enough to cause some page evictions. 
let mut efiles = Vec::new(); for fileno in 0..100 { - let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let efile = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); efiles.push((fileno, efile)); @@ -398,10 +390,10 @@ mod tests { } #[test] - fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; + fn test_ephemeral_blobs() -> io::Result<()> { + let (harness, timeline_id) = harness()?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let mut file = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 013591caee..58b7eea1eb 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1064,7 +1064,7 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { - let harness = TenantHarness::create("upload_scheduling")?; + let harness = TenantHarness::new()?; let timeline_path = harness.timeline_path(&TIMELINE_ID); std::fs::create_dir_all(&timeline_path)?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index fb216123c1..3ad049cc21 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -525,12 +525,13 @@ mod tests { }) } - fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> + fn test_files(test_name: &str, openfunc: OF) -> Result<(), Error> where FD: Read + Write + Seek + FileExt, OF: Fn(&Path, &OpenOptions) -> Result, { - let testdir = crate::config::PageServerConf::test_repo_dir(testname); + let temp_repo_dir = tempfile::tempdir()?; + let testdir = temp_repo_dir.path().join(test_name); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); @@ -632,7 +633,8 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; - let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); + let temp_repo_dir = tempfile::tempdir()?; + let testdir = temp_repo_dir.path().join("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; // Create a test file. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0de2e6654d..77fce95160 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1146,7 +1146,8 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let tenant = TenantHarness::create("test_relsize")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1323,7 +1324,8 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let tenant = TenantHarness::create("test_drop_extend")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1376,7 +1378,8 @@ mod tests { // and then extended it again within the same layer. 
#[tokio::test] async fn test_truncate_extend() -> Result<()> { - let tenant = TenantHarness::create("test_truncate_extend")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1497,7 +1500,8 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[tokio::test] async fn test_large_rel() -> Result<()> { - let tenant = TenantHarness::create("test_large_rel")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 8b60e59305..be58aa0e07 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -846,7 +846,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -879,7 +879,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("connection_no_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -942,7 +942,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1001,7 +1001,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1041,7 +1041,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1105,7 +1105,7 @@ mod tests { #[tokio::test] async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1166,7 +1166,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1232,7 +1232,7 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; - async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + async fn dummy_state(harness: 
&TenantHarness) -> WalreceiverState { WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, diff --git a/test_runner/sql_regress/.gitignore b/test_runner/sql_regress/.gitignore index 89129d7358..83186b5c86 100644 --- a/test_runner/sql_regress/.gitignore +++ b/test_runner/sql_regress/.gitignore @@ -2,7 +2,6 @@ /pg_regress # Generated subdirectories -/tmp_check/ /results/ /log/ From ffca97bc1e10b6c351067b14bdee6bbc68369c4b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 12 Jan 2023 15:14:04 +0200 Subject: [PATCH 25/63] Enable logs in unit tests --- libs/utils/src/logging.rs | 2 ++ pageserver/src/tenant.rs | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 3b1a1f5aff..82c9267f4a 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -8,6 +8,7 @@ use strum_macros::{EnumString, EnumVariantNames}; pub enum LogFormat { Plain, Json, + Test, } impl LogFormat { @@ -39,6 +40,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { match log_format { LogFormat::Json => base_logger.json().init(), LogFormat::Plain => base_logger.init(), + LogFormat::Test => base_logger.with_test_writer().init(), } Ok(()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0dd6735993..c53c9bc3e1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2626,9 +2626,11 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; + use once_cell::sync::OnceCell; use std::sync::Arc; use std::{fs, path::PathBuf}; use tempfile::TempDir; + use utils::logging; use utils::lsn::Lsn; use crate::{ @@ -2694,6 +2696,10 @@ pub mod harness { impl TenantHarness { pub fn new() -> anyhow::Result { + LOG_HANDLE.get_or_init(|| { + logging::init(logging::LogFormat::Test).expect("Failed to init test logging") + }); + let temp_repo_dir = tempfile::tempdir()?; // `TempDir` uses a randomly generated subdirectory of a system tmp dir, // so far it's enough to take care of concurrently running tests. From 7b22b5c43321bf729caea4766c7f98b589c405ab Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Jan 2023 11:30:02 +0200 Subject: [PATCH 26/63] Switch to 'tracing' for logging, restructure code to make use of spans. Refactors Compute::prepare_and_run. It's split into subroutines differently, to make it easier to attach tracing spans to the different stages. The high-level logic for waiting for Postgres to exit is moved to the caller. Replace 'env_logger' with 'tracing', and add `#instrument` directives to different stages fo the startup process. This is a fairly mechanical change, except for the changes in 'spec.rs'. 'spec.rs' contained some complicated formatting, where parts of log messages were printed directly to stdout with `print`s. That was a bit messed up because the log normally goes to stderr, but those lines were printed to stdout. In our docker images, stderr and stdout both go to the same place so you wouldn't notice, but I don't think it was intentional. This changes the log format to the default 'tracing_subscriber::format' format. It's different from the Postgres log format, however, and because both compute_tools and Postgres print to the same log, it's now a mix of two different formats. I'm not sure how the Grafana log parsing pipeline can handle that. 
If it's a problem, we can build custom formatter to change the compute_tools log format to be the same as Postgres's, like it was before this commit, or we can change the Postgres log format to match tracing_formatter's, or we can start printing compute_tool's log output to a different destination than Postgres --- Cargo.lock | 6 +- compute_tools/Cargo.toml | 4 +- compute_tools/src/bin/compute_ctl.rs | 55 +++++--- compute_tools/src/checker.rs | 4 +- compute_tools/src/compute.rs | 77 ++++++----- compute_tools/src/http/api.rs | 2 +- compute_tools/src/informant.rs | 2 +- compute_tools/src/logger.rs | 44 ++---- compute_tools/src/monitor.rs | 2 +- compute_tools/src/pg_helpers.rs | 12 +- compute_tools/src/spec.rs | 171 ++++++++++++++---------- test_runner/regress/test_compute_ctl.py | 2 +- workspace_hack/Cargo.toml | 5 +- 13 files changed, 210 insertions(+), 176 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 59adf696a7..d8aba9ba68 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -833,10 +833,8 @@ dependencies = [ "anyhow", "chrono", "clap 4.0.32", - "env_logger", "futures", "hyper", - "log", "notify", "postgres", "regex", @@ -845,6 +843,8 @@ dependencies = [ "tar", "tokio", "tokio-postgres", + "tracing", + "tracing-subscriber", "url", "workspace_hack", ] @@ -1954,7 +1954,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", - "serde", ] [[package]] @@ -4565,6 +4564,7 @@ dependencies = [ "tower", "tracing", "tracing-core", + "tracing-subscriber", "url", ] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 1e0aee81d7..4536604bdf 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -8,10 +8,8 @@ license.workspace = true anyhow.workspace = true chrono.workspace = true clap.workspace = true -env_logger.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } -log = { workspace = true, features = ["std", "serde"] } notify.workspace = true postgres.workspace = true regex.workspace = true @@ -20,6 +18,8 @@ serde_json.workspace = true tar.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true +tracing.workspace = true +tracing-subscriber.workspace = true url.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 307300cfd8..e5ab8eb153 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -40,7 +40,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; -use log::{error, info}; +use tracing::{error, info}; use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; use compute_tools::http::api::launch_http_server; @@ -53,7 +53,6 @@ use compute_tools::spec::*; use url::Url; fn main() -> Result<()> { - // TODO: re-use `utils::logging` later init_logger(DEFAULT_LOG_LEVEL)?; let matches = cli().get_matches(); @@ -122,29 +121,45 @@ fn main() -> Result<()> { // Also spawn the thread responsible for handling the VM informant -- if it's present let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant"); - // Run compute (Postgres) and hang waiting on it. 
- match compute.prepare_and_run() { - Ok(ec) => { - let code = ec.code().unwrap_or(1); - info!("Postgres exited with code {}, shutting down", code); - exit(code) - } - Err(error) => { - error!("could not start the compute node: {:?}", error); - + // Start Postgres + let mut delay_exit = false; + let mut exit_code = None; + let pg = match compute.start_compute() { + Ok(pg) => Some(pg), + Err(err) => { + error!("could not start the compute node: {:?}", err); let mut state = compute.state.write().unwrap(); - state.error = Some(format!("{:?}", error)); + state.error = Some(format!("{:?}", err)); state.status = ComputeStatus::Failed; drop(state); - - // Keep serving HTTP requests, so the cloud control plane was able to - // get the actual error. - info!("giving control plane 30s to collect the error before shutdown"); - thread::sleep(Duration::from_secs(30)); - info!("shutting down"); - Err(error) + delay_exit = true; + None } + }; + + // Wait for the child Postgres process forever. In this state Ctrl+C will + // propagate to Postgres and it will be shut down as well. + if let Some(mut pg) = pg { + let ecode = pg + .wait() + .expect("failed to start waiting on Postgres process"); + info!("Postgres exited with code {}, shutting down", ecode); + exit_code = ecode.code() } + + if let Err(err) = compute.check_for_core_dumps() { + error!("error while checking for core dumps: {err:?}"); + } + + // If launch failed, keep serving HTTP requests for a while, so the cloud + // control plane can get the actual error. + if delay_exit { + info!("giving control plane 30s to collect the error before shutdown"); + thread::sleep(Duration::from_secs(30)); + info!("shutting down"); + } + + exit(exit_code.unwrap_or(1)) } fn cli() -> clap::Command { diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index ee1605c814..b8413de516 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,10 +1,11 @@ use anyhow::{anyhow, Result}; -use log::error; use postgres::Client; use tokio_postgres::NoTls; +use tracing::{error, instrument}; use crate::compute::ComputeNode; +#[instrument(skip_all)] pub fn create_writability_check_data(client: &mut Client) -> Result<()> { let query = " CREATE TABLE IF NOT EXISTS health_check ( @@ -21,6 +22,7 @@ pub fn create_writability_check_data(client: &mut Client) -> Result<()> { Ok(()) } +#[instrument(skip_all)] pub async fn check_writability(compute: &ComputeNode) -> Result<()> { let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; if client.is_closed() { diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d652084e00..e229bb1cc2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -17,15 +17,15 @@ use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; -use std::process::{Command, ExitStatus, Stdio}; +use std::process::{Command, Stdio}; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::RwLock; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use log::{error, info, warn}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; +use tracing::{info, instrument, warn}; use crate::checker::create_writability_check_data; use crate::config; @@ -121,6 +121,7 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. 
+ #[instrument(skip(self))] fn get_basebackup(&self, lsn: &str) -> Result<()> { let start_time = Utc::now(); @@ -154,6 +155,7 @@ impl ComputeNode { // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. + #[instrument(skip(self))] fn sync_safekeepers(&self) -> Result { let start_time = Utc::now(); @@ -196,6 +198,7 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. + #[instrument(skip(self))] pub fn prepare_pgdata(&self) -> Result<()> { let spec = &self.spec; let pgdata_path = Path::new(&self.pgdata); @@ -229,9 +232,8 @@ impl ComputeNode { /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. - pub fn run(&self) -> Result { - let start_time = Utc::now(); - + #[instrument(skip(self))] + pub fn start_postgres(&self) -> Result { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. @@ -242,6 +244,11 @@ impl ComputeNode { wait_for_postgres(&mut pg, pgdata_path)?; + Ok(pg) + } + + #[instrument(skip(self))] + pub fn apply_config(&self) -> Result<()> { // If connection fails, // it may be the old node with `zenith_admin` superuser. // @@ -279,8 +286,34 @@ impl ComputeNode { // 'Close' connection drop(client); - let startup_end_time = Utc::now(); + info!( + "finished configuration of compute for project {}", + self.spec.cluster.cluster_id + ); + + Ok(()) + } + + #[instrument(skip(self))] + pub fn start_compute(&self) -> Result { + info!( + "starting compute for project {}, operation {}, tenant {}, timeline {}", + self.spec.cluster.cluster_id, + self.spec.operation_uuid.as_ref().unwrap(), + self.tenant, + self.timeline, + ); + + self.prepare_pgdata()?; + + let start_time = Utc::now(); + + let pg = self.start_postgres()?; + + self.apply_config()?; + + let startup_end_time = Utc::now(); self.metrics.config_ms.store( startup_end_time .signed_duration_since(start_time) @@ -300,35 +333,7 @@ impl ComputeNode { self.set_status(ComputeStatus::Running); - info!( - "finished configuration of compute for project {}", - self.spec.cluster.cluster_id - ); - - // Wait for child Postgres process basically forever. In this state Ctrl+C - // will propagate to Postgres and it will be shut down as well. - let ecode = pg - .wait() - .expect("failed to start waiting on Postgres process"); - - if let Err(err) = self.check_for_core_dumps() { - error!("error while checking for core dumps: {err:?}"); - } - - Ok(ecode) - } - - pub fn prepare_and_run(&self) -> Result { - info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}", - self.spec.cluster.cluster_id, - self.spec.operation_uuid.as_ref().unwrap(), - self.tenant, - self.timeline, - ); - - self.prepare_pgdata()?; - self.run() + Ok(pg) } // Look for core dumps and collect backtraces. @@ -341,7 +346,7 @@ impl ComputeNode { // // Use that as a default location and pattern, except macos where core dumps are written // to /cores/ directory by default. 
- fn check_for_core_dumps(&self) -> Result<()> { + pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), _ => Path::new(&self.pgdata), diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 44f83e5003..f2a49f332c 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -6,8 +6,8 @@ use std::thread; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; -use log::{error, info}; use serde_json; +use tracing::{error, info}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/informant.rs b/compute_tools/src/informant.rs index 09bd5e3138..8a6e3ab43a 100644 --- a/compute_tools/src/informant.rs +++ b/compute_tools/src/informant.rs @@ -1,8 +1,8 @@ -use log::{info, warn}; use std::path::Path; use std::process; use std::thread; use std::time::Duration; +use tracing::{info, warn}; use anyhow::{Context, Result}; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index dde0a950f8..57e5496e86 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,42 +1,20 @@ -use std::io::Write; - use anyhow::Result; -use chrono::Utc; -use env_logger::{Builder, Env}; - -macro_rules! info_println { - ($($tts:tt)*) => { - if log_enabled!(Level::Info) { - println!($($tts)*); - } - } -} - -macro_rules! info_print { - ($($tts:tt)*) => { - if log_enabled!(Level::Info) { - print!($($tts)*); - } - } -} +use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::prelude::*; /// Initialize `env_logger` using either `default_level` or /// `RUST_LOG` environment variable as default log level. pub fn init_logger(default_level: &str) -> Result<()> { - let env = Env::default().filter_or("RUST_LOG", default_level); + let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level)); - Builder::from_env(env) - .format(|buf, record| { - let thread_handle = std::thread::current(); - writeln!( - buf, - "{} [{}] {}: {}", - Utc::now().format("%Y-%m-%d %H:%M:%S%.3f %Z"), - thread_handle.name().unwrap_or("main"), - record.level(), - record.args() - ) - }) + let fmt_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_writer(std::io::stderr); + + tracing_subscriber::registry() + .with(env_filter) + .with(fmt_layer) .init(); Ok(()) diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index c871422e78..7c9878ffcf 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -3,8 +3,8 @@ use std::{thread, time}; use anyhow::Result; use chrono::{DateTime, Utc}; -use log::{debug, info}; use postgres::{Client, NoTls}; +use tracing::{debug, info}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ff422f1cf5..921289d7c2 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -11,6 +11,7 @@ use anyhow::{bail, Result}; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; use serde::Deserialize; +use tracing::{debug, instrument}; const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds @@ -229,6 +230,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { /// Wait for Postgres to become ready to accept connections. 
It's ready to /// accept connections when the state-field in `pgdata/postmaster.pid` says /// 'ready'. +#[instrument(skip(pg))] pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); @@ -287,18 +289,18 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { } let res = rx.recv_timeout(Duration::from_millis(100)); - log::debug!("woken up by notify: {res:?}"); + debug!("woken up by notify: {res:?}"); // If there are multiple events in the channel already, we only need to be // check once. Swallow the extra events before we go ahead to check the // pid file. while let Ok(res) = rx.try_recv() { - log::debug!("swallowing extra event: {res:?}"); + debug!("swallowing extra event: {res:?}"); } // Check that we can open pid file first. if let Ok(file) = File::open(&pid_path) { if !postmaster_pid_seen { - log::debug!("postmaster.pid appeared"); + debug!("postmaster.pid appeared"); watcher .unwatch(pgdata) .expect("Failed to remove pgdata dir watch"); @@ -314,7 +316,7 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { // Pid file could be there and we could read it, but it could be empty, for example. if let Some(Ok(line)) = last_line { let status = line.trim(); - log::debug!("last line of postmaster.pid: {status:?}"); + debug!("last line of postmaster.pid: {status:?}"); // Now Postgres is ready to accept connections if status == "ready" { @@ -330,7 +332,7 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { } } - log::info!("PostgreSQL is now running, continuing to configure it"); + tracing::info!("PostgreSQL is now running, continuing to configure it"); Ok(()) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 81e01fe555..40c8366bf4 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,12 +1,11 @@ use std::path::Path; use std::str::FromStr; -use std::time::Instant; use anyhow::Result; -use log::{info, log_enabled, warn, Level}; use postgres::config::Config; use postgres::{Client, NoTls}; use serde::Deserialize; +use tracing::{info, info_span, instrument, span_enabled, warn, Level}; use crate::compute::ComputeNode; use crate::config; @@ -80,23 +79,25 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Given a cluster spec json and open transaction it handles roles creation, /// deletion and update. 
+#[instrument(skip_all)] pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let mut xact = client.transaction()?; let existing_roles: Vec = get_existing_roles(&mut xact)?; // Print a list of existing Postgres roles (only in debug mode) - info!("postgres roles:"); - for r in &existing_roles { - info_println!( - "{} - {}:{}", - " ".repeat(27 + 5), - r.name, - if r.encrypted_password.is_some() { - "[FILTERED]" - } else { - "(null)" - } - ); + if span_enabled!(Level::INFO) { + info!("postgres roles:"); + for r in &existing_roles { + info!( + " - {}:{}", + r.name, + if r.encrypted_password.is_some() { + "[FILTERED]" + } else { + "(null)" + } + ); + } } // Process delta operations first @@ -137,58 +138,68 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { info!("cluster spec roles:"); for role in &spec.cluster.roles { let name = &role.name; - - info_print!( - "{} - {}:{}", - " ".repeat(27 + 5), - name, - if role.encrypted_password.is_some() { - "[FILTERED]" - } else { - "(null)" - } - ); - // XXX: with a limited number of roles it is fine, but consider making it a HashMap let pg_role = existing_roles.iter().find(|r| r.name == *name); - if let Some(r) = pg_role { - let mut update_role = false; - + enum RoleAction { + None, + Update, + Create, + } + let action = if let Some(r) = pg_role { if (r.encrypted_password.is_none() && role.encrypted_password.is_some()) || (r.encrypted_password.is_some() && role.encrypted_password.is_none()) { - update_role = true; + RoleAction::Update } else if let Some(pg_pwd) = &r.encrypted_password { // Check whether password changed or not (trim 'md5:' prefix first) - update_role = pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap(); + if pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap() { + RoleAction::Update + } else { + RoleAction::None + } + } else { + RoleAction::None } + } else { + RoleAction::Create + }; - if update_role { + match action { + RoleAction::None => {} + RoleAction::Update => { let mut query: String = format!("ALTER ROLE {} ", name.pg_quote()); - info_print!(" -> update"); - query.push_str(&role.to_pg_options()); xact.execute(query.as_str(), &[])?; } - } else { - info!("role name: '{}'", &name); - let mut query: String = format!("CREATE ROLE {} ", name.pg_quote()); - info!("role create query: '{}'", &query); - info_print!(" -> create"); + RoleAction::Create => { + let mut query: String = format!("CREATE ROLE {} ", name.pg_quote()); + info!("role create query: '{}'", &query); + query.push_str(&role.to_pg_options()); + xact.execute(query.as_str(), &[])?; - query.push_str(&role.to_pg_options()); - xact.execute(query.as_str(), &[])?; - - let grant_query = format!( - "GRANT pg_read_all_data, pg_write_all_data TO {}", - name.pg_quote() - ); - xact.execute(grant_query.as_str(), &[])?; - info!("role grant query: '{}'", &grant_query); + let grant_query = format!( + "GRANT pg_read_all_data, pg_write_all_data TO {}", + name.pg_quote() + ); + xact.execute(grant_query.as_str(), &[])?; + info!("role grant query: '{}'", &grant_query); + } } - info_print!("\n"); + if span_enabled!(Level::INFO) { + let pwd = if role.encrypted_password.is_some() { + "[FILTERED]" + } else { + "(null)" + }; + let action_str = match action { + RoleAction::None => "", + RoleAction::Create => " -> create", + RoleAction::Update => " -> update", + }; + info!(" - {}:{}{}", name, pwd, action_str); + } } xact.commit()?; @@ -197,6 +208,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { } /// 
Reassign all dependent objects and delete requested roles. +#[instrument(skip_all)] pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { if let Some(ops) = &node.spec.delta_operations { // First, reassign all dependent objects to db owners. @@ -261,13 +273,16 @@ fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level /// atomicity should be enough here due to the order of operations and various checks, /// which together provide us idempotency. +#[instrument(skip_all)] pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let existing_dbs: Vec = get_existing_dbs(client)?; // Print a list of existing Postgres databases (only in debug mode) - info!("postgres databases:"); - for r in &existing_dbs { - info_println!("{} - {}:{}", " ".repeat(27 + 5), r.name, r.owner); + if span_enabled!(Level::INFO) { + info!("postgres databases:"); + for r in &existing_dbs { + info!(" {}:{}", r.name, r.owner); + } } // Process delta operations first @@ -310,13 +325,15 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { for db in &spec.cluster.databases { let name = &db.name; - info_print!("{} - {}:{}", " ".repeat(27 + 5), db.name, db.owner); - // XXX: with a limited number of databases it is fine, but consider making it a HashMap let pg_db = existing_dbs.iter().find(|r| r.name == *name); - let start_time = Instant::now(); - if let Some(r) = pg_db { + enum DatabaseAction { + None, + Update, + Create, + } + let action = if let Some(r) = pg_db { // XXX: db owner name is returned as quoted string from Postgres, // when quoting is needed. let new_owner = if r.owner.starts_with('"') { @@ -326,29 +343,42 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { }; if new_owner != r.owner { + // Update the owner + DatabaseAction::Update + } else { + DatabaseAction::None + } + } else { + DatabaseAction::Create + }; + + match action { + DatabaseAction::None => {} + DatabaseAction::Update => { let query: String = format!( "ALTER DATABASE {} OWNER TO {}", name.pg_quote(), db.owner.pg_quote() ); - info_print!(" -> update"); - + let _ = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; - let elapsed = start_time.elapsed().as_millis(); - info_print!(" ({} ms)", elapsed); } - } else { - let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); - info_print!(" -> create"); + DatabaseAction::Create => { + let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); + query.push_str(&db.to_pg_options()); + let _ = info_span!("executing", query).entered(); + client.execute(query.as_str(), &[])?; + } + }; - query.push_str(&db.to_pg_options()); - client.execute(query.as_str(), &[])?; - - let elapsed = start_time.elapsed().as_millis(); - info_print!(" ({} ms)", elapsed); + if span_enabled!(Level::INFO) { + let action_str = match action { + DatabaseAction::None => "", + DatabaseAction::Create => " -> create", + DatabaseAction::Update => " -> update", + }; + info!(" - {}:{}{}", db.name, db.owner, action_str); } - - info_print!("\n"); } Ok(()) @@ -356,6 +386,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. 
+#[instrument(skip_all)] pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { let spec = &node.spec; diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py index f973bd8e60..05ac3841dc 100644 --- a/test_runner/regress/test_compute_ctl.py +++ b/test_runner/regress/test_compute_ctl.py @@ -194,7 +194,7 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) except TimeoutExpired as exc: ctl_logs = (exc.stderr or b"").decode("utf-8") - log.info("compute_ctl output:\n{ctl_logs}") + log.info(f"compute_ctl stderr:\n{ctl_logs}") with ExternalProcessManager(Path(pgdata) / "postmaster.pid"): start = "starting safekeepers syncing" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8addfcf72e..f4b71ae9b7 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -26,7 +26,7 @@ futures-util = { version = "0.3", features = ["channel", "io", "sink"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["serde", "std"] } +log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } @@ -45,6 +45,7 @@ tokio-util = { version = "0.7", features = ["codec", "io"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } url = { version = "2", features = ["serde"] } [build-dependencies] @@ -54,7 +55,7 @@ either = { version = "1" } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["serde", "std"] } +log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } prost = { version = "0.11" } From 300da5b872e105e404eb0842c9302f5742fb164b Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 19 Jan 2023 10:29:15 -0500 Subject: [PATCH 27/63] Improve layer map docstrings (#3382) --- pageserver/src/tenant/layer_map.rs | 52 ++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 44bed5959f..01c5359e88 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -250,15 +250,32 @@ where L: ?Sized + Layer, { /// - /// Find the latest layer that covers the given 'key', with lsn < - /// 'end_lsn'. + /// Find the latest layer (by lsn.end) that covers the given + /// 'key', with lsn.start < 'end_lsn'. /// - /// Returns the layer, if any, and an 'lsn_floor' value that - /// indicates which portion of the layer the caller should - /// check. 'lsn_floor' is normally the start-LSN of the layer, but - /// can be greater if there is an overlapping layer that might - /// contain the version, even if it's missing from the returned - /// layer. + /// The caller of this function is the page reconstruction + /// algorithm looking for the next relevant delta layer, or + /// the terminal image layer. 
The caller will pass the lsn_floor + /// value as end_lsn in the next call to search. + /// + /// If there's an image layer exactly below the given end_lsn, + /// search should return that layer regardless if there are + /// overlapping deltas. + /// + /// If the latest layer is a delta and there is an overlapping + /// image with it below, the lsn_floor returned should be right + /// above that image so we don't skip it in the search. Otherwise + /// the lsn_floor returned should be the bottom of the delta layer + /// because we should make as much progress down the lsn axis + /// as possible. It's fine if this way we skip some overlapping + /// deltas, because the delta we returned would contain the same + /// wal content. + /// + /// TODO: This API is convoluted and inefficient. If the caller + /// makes N search calls, we'll end up finding the same latest + /// image layer N times. We should either cache the latest image + /// layer result, or simplify the api to `get_latest_image` and + /// `get_latest_delta`, and only call `get_latest_image` once. /// /// NOTE: This only searches the 'historic' layers, *not* the /// 'open' and 'frozen' layers! @@ -401,7 +418,9 @@ where NUM_ONDISK_LAYERS.dec(); } - /// Is there a newer image layer for given key- and LSN-range? + /// Is there a newer image layer for given key- and LSN-range? Or a set + /// of image layers within the specified lsn range that cover the entire + /// specified key range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. @@ -488,8 +507,8 @@ where /// /// Divide the whole given range of keys into sub-ranges based on the latest - /// image layer that covers each range. (This is used when creating new - /// image layers) + /// image layer that covers each range at the specified lsn (inclusive). + /// This is used when creating new image layers. /// // FIXME: clippy complains that the result type is very complex. She's probably // right... @@ -541,8 +560,15 @@ where Ok(ranges) } - /// Count how many L1 delta layers there are that overlap with the - /// given key and LSN range. + /// Count the height of the tallest stack of deltas in this 2d region. + /// + /// This number is used to compute the largest number of deltas that + /// we'll need to visit for any page reconstruction in this region. + /// We use this heuristic to decide whether to create an image layer. + /// + /// TODO currently we just return the total number of deltas in the + /// region, no matter if they're stacked on top of each other + /// or next to each other. pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { let mut result = 0; if lsn_range.start >= lsn_range.end { From 262265daad0d1a000d1f425d71ddfb723c3a05e8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 19 Jan 2023 18:49:36 +0100 Subject: [PATCH 28/63] Revert "Use actual temporary dir for pageserver unit tests" This reverts commit 826e89b9ce43ce2c4d046b2c5d6376c3de8dbbac. The problem with that commit was that it deletes the TempDir while there are still EphemeralFile instances open. At first I thought this could be fixed by simply adding Handle::current().block_on(task_mgr::shutdown(None, Some(tenant_id), None)) to TenantHarness::drop, but it turned out to be insufficient. So, reverting the commit until we find a proper solution. 
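For reference, that attempted fix was roughly the following sketch; the
task_mgr::shutdown call is the one mentioned above, while the surrounding
Drop impl is illustrative only and assumes a tokio runtime is available in
the test context:

    impl Drop for TenantHarness {
        fn drop(&mut self) {
            // Try to wait for this tenant's background tasks before the
            // TempDir (and the EphemeralFiles inside it) is removed.
            // This turned out to be insufficient, hence the revert.
            let tenant_id = self.tenant_id;
            tokio::runtime::Handle::current()
                .block_on(task_mgr::shutdown(None, Some(tenant_id), None));
        }
    }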
refs https://github.com/neondatabase/neon/issues/3385 --- .gitignore | 2 + control_plane/.gitignore | 1 + pageserver/src/config.rs | 5 + pageserver/src/tenant.rs | 105 +++++++++++------- pageserver/src/tenant/ephemeral_file.rs | 38 ++++--- .../src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/virtual_file.rs | 8 +- pageserver/src/walingest.rs | 12 +- .../src/walreceiver/connection_manager.rs | 16 +-- test_runner/sql_regress/.gitignore | 1 + 10 files changed, 110 insertions(+), 80 deletions(-) create mode 100644 control_plane/.gitignore diff --git a/.gitignore b/.gitignore index 2e241ee8cd..f1afdee599 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ /pg_install /target +/tmp_check +/tmp_check_cli __pycache__/ test_output/ .vscode diff --git a/control_plane/.gitignore b/control_plane/.gitignore new file mode 100644 index 0000000000..c1e54a6bcb --- /dev/null +++ b/control_plane/.gitignore @@ -0,0 +1 @@ +tmp_check/ diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f3e5fb8c1a..51d1664e52 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -693,6 +693,11 @@ impl PageServerConf { Ok(t_conf) } + #[cfg(test)] + pub fn test_repo_dir(test_name: &str) -> PathBuf { + PathBuf::from(format!("../tmp_check/test_{test_name}")) + } + pub fn dummy_conf(repo_dir: PathBuf) -> Self { let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c53c9bc3e1..c18c645e5b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2626,10 +2626,10 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; + use once_cell::sync::Lazy; use once_cell::sync::OnceCell; - use std::sync::Arc; + use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; - use tempfile::TempDir; use utils::logging; use utils::lsn::Lsn; @@ -2661,6 +2661,8 @@ pub mod harness { buf.freeze() } + static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); + impl From for TenantConfOpt { fn from(tenant_conf: TenantConf) -> Self { Self { @@ -2681,31 +2683,42 @@ pub mod harness { } } - /// The harness saves some boilerplate and provides a way to create functional tenant - /// without running pageserver binary. It uses temporary directory to store data in it. - /// Tempdir gets removed on harness drop. - pub struct TenantHarness { - // keep the struct to not to remove tmp dir during the test - _temp_repo_dir: TempDir, + pub struct TenantHarness<'a> { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: TenantId, + + pub lock_guard: ( + Option>, + Option>, + ), } static LOG_HANDLE: OnceCell<()> = OnceCell::new(); - impl TenantHarness { - pub fn new() -> anyhow::Result { + impl<'a> TenantHarness<'a> { + pub fn create(test_name: &'static str) -> anyhow::Result { + Self::create_internal(test_name, false) + } + pub fn create_exclusive(test_name: &'static str) -> anyhow::Result { + Self::create_internal(test_name, true) + } + fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result { + let lock_guard = if exclusive { + (None, Some(LOCK.write().unwrap())) + } else { + (Some(LOCK.read().unwrap()), None) + }; + LOG_HANDLE.get_or_init(|| { logging::init(logging::LogFormat::Test).expect("Failed to init test logging") }); - let temp_repo_dir = tempfile::tempdir()?; - // `TempDir` uses a randomly generated subdirectory of a system tmp dir, - // so far it's enough to take care of concurrently running tests. 
- let repo_dir = temp_repo_dir.path(); + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + fs::create_dir_all(&repo_dir)?; - let conf = PageServerConf::dummy_conf(repo_dir.to_path_buf()); + let conf = PageServerConf::dummy_conf(repo_dir); // Make a static copy of the config. This can never be free'd, but that's // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); @@ -2723,10 +2736,10 @@ pub mod harness { fs::create_dir_all(conf.timelines_path(&tenant_id))?; Ok(Self { - _temp_repo_dir: temp_repo_dir, conf, tenant_conf, tenant_id, + lock_guard, }) } @@ -2820,8 +2833,7 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_basic")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2854,8 +2866,9 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("no_duplicate_timelines")? + .load() + .await; let _ = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2886,8 +2899,7 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_branch")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2984,8 +2996,10 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + .load() + .await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3020,8 +3034,9 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + .load() + .await; tenant .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? @@ -3070,8 +3085,9 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + .load() + .await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3093,8 +3109,9 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? + .load() + .await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; @@ -3125,7 +3142,8 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + const TEST_NAME: &str = "timeline_load"; + let harness = TenantHarness::create(TEST_NAME)?; { let tenant = harness.load().await; let tline = tenant @@ -3144,7 +3162,8 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + const TEST_NAME: &str = "timeline_load_with_ancestor"; + let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { let tenant = harness.load().await; @@ -3182,7 +3201,8 @@ mod tests { #[tokio::test] async fn corrupt_metadata() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + const TEST_NAME: &str = "corrupt_metadata"; + let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load().await; tenant @@ -3223,8 +3243,7 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_images")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3291,8 +3310,7 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_bulk_insert")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3336,8 +3354,7 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_random_updates")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3410,8 +3427,9 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_traverse_branches")? + .load() + .await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3495,8 +3513,9 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_traverse_ancestors")? + .load() + .await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 0debeaff1c..c433e65ad2 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -76,7 +76,7 @@ impl EphemeralFile { }) } - fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> io::Result<()> { + fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> { let mut off = 0; while off < PAGE_SZ { let n = self @@ -277,7 +277,7 @@ impl Drop for EphemeralFile { } } -pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> io::Result<()> { +pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> { if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) { match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) { Ok(_) => Ok(()), @@ -332,17 +332,25 @@ mod tests { use super::*; use crate::tenant::blob_io::{BlobCursor, BlobWriter}; use crate::tenant::block_io::BlockCursor; - use crate::tenant::harness::TenantHarness; use rand::{seq::SliceRandom, thread_rng, RngCore}; use std::fs; use std::str::FromStr; - fn harness() -> Result<(TenantHarness, TimelineId), io::Error> { - let harness = TenantHarness::new().expect("Failed to create tenant harness"); - let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(harness.timeline_path(&timeline_id))?; + fn harness( + test_name: &str, + ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + let conf = PageServerConf::dummy_conf(repo_dir); + // Make a static copy of the config. This can never be free'd, but that's + // OK in a test. + let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - Ok((harness, timeline_id)) + let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); + fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; + + Ok((conf, tenant_id, timeline_id)) } // Helper function to slurp contents of a file, starting at the current position, @@ -359,10 +367,10 @@ mod tests { } #[test] - fn test_ephemeral_files() -> io::Result<()> { - let (harness, timeline_id) = harness()?; + fn test_ephemeral_files() -> Result<(), io::Error> { + let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?; - let file_a = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; + let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?; file_a.write_all_at(b"foo", 0)?; assert_eq!("foo", read_string(&file_a, 0, 20)?); @@ -373,7 +381,7 @@ mod tests { // Open a lot of files, enough to cause some page evictions. 
let mut efiles = Vec::new(); for fileno in 0..100 { - let efile = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; + let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?; efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); efiles.push((fileno, efile)); @@ -390,10 +398,10 @@ mod tests { } #[test] - fn test_ephemeral_blobs() -> io::Result<()> { - let (harness, timeline_id) = harness()?; + fn test_ephemeral_blobs() -> Result<(), io::Error> { + let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 58b7eea1eb..013591caee 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1064,7 +1064,7 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("upload_scheduling")?; let timeline_path = harness.timeline_path(&TIMELINE_ID); std::fs::create_dir_all(&timeline_path)?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 3ad049cc21..fb216123c1 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -525,13 +525,12 @@ mod tests { }) } - fn test_files(test_name: &str, openfunc: OF) -> Result<(), Error> + fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> where FD: Read + Write + Seek + FileExt, OF: Fn(&Path, &OpenOptions) -> Result, { - let temp_repo_dir = tempfile::tempdir()?; - let testdir = temp_repo_dir.path().join(test_name); + let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); @@ -633,8 +632,7 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; - let temp_repo_dir = tempfile::tempdir()?; - let testdir = temp_repo_dir.path().join("vfile_concurrency"); + let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; // Create a test file. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 77fce95160..0de2e6654d 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1146,8 +1146,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_relsize")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1324,8 +1323,7 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_drop_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1378,8 +1376,7 @@ mod tests { // and then extended it again within the same layer. 
#[tokio::test] async fn test_truncate_extend() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_truncate_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1500,8 +1497,7 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[tokio::test] async fn test_large_rel() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_large_rel")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index be58aa0e07..8b60e59305 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -846,7 +846,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("no_connection_no_candidate")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -879,7 +879,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("connection_no_candidate")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -942,7 +942,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("no_connection_candidate")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1001,7 +1001,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("candidate_with_many_connection_failures")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1041,7 +1041,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1105,7 +1105,7 @@ mod tests { #[tokio::test] async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1166,7 +1166,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1232,7 +1232,7 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; - async fn dummy_state(harness: &TenantHarness) -> WalreceiverState { + async fn dummy_state(harness: 
&TenantHarness<'_>) -> WalreceiverState { WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, diff --git a/test_runner/sql_regress/.gitignore b/test_runner/sql_regress/.gitignore index 83186b5c86..89129d7358 100644 --- a/test_runner/sql_regress/.gitignore +++ b/test_runner/sql_regress/.gitignore @@ -2,6 +2,7 @@ /pg_regress # Generated subdirectories +/tmp_check/ /results/ /log/ From 47f9890bae75c63fc4b29c009cf9020ac2bcbafa Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 20 Jan 2023 15:37:24 +0100 Subject: [PATCH 29/63] [compute_ctl] Make role deletion spec processing idempotent (#3380) Previously, we were trying to re-assign owned objects of the already deleted role. This were causing a crash loop in the case when compute was restarted with a spec that includes delta operation for role deletion. To avoid such cases, check that role is still present before calling `reassign_owned_objects`. Resolves neondatabase/cloud#3553 --- compute_tools/src/compute.rs | 3 ++- compute_tools/src/pg_helpers.rs | 4 ++-- compute_tools/src/spec.rs | 14 +++++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index e229bb1cc2..c8af8822b7 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -252,7 +252,7 @@ impl ComputeNode { // If connection fails, // it may be the old node with `zenith_admin` superuser. // - // In this case we need to connect with old `zenith_admin`name + // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. let mut client = match Client::connect(self.connstr.as_str(), NoTls) { @@ -278,6 +278,7 @@ impl ComputeNode { Ok(client) => client, }; + // Proceed with post-startup configuration. Note, that order of operations is important. handle_roles(&self.spec, &mut client)?; handle_databases(&self.spec, &mut client)?; handle_role_deletions(self, &mut client)?; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 921289d7c2..6ab2864721 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -130,8 +130,8 @@ impl Role { /// Serialize a list of role parameters into a Postgres-acceptable /// string of arguments. pub fn to_pg_options(&self) -> String { - // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in Rails. - // For now we do not use generic `options` for roles. Once used, add + // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. + // For now, we do not use generic `options` for roles. Once used, add // `self.options.as_pg_options()` somewhere here. let mut params: String = "LOGIN".to_string(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 40c8366bf4..97cd623052 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -213,8 +213,20 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< if let Some(ops) = &node.spec.delta_operations { // First, reassign all dependent objects to db owners. info!("reassigning dependent objects of to-be-deleted roles"); + + // Fetch existing roles. We could've exported and used `existing_roles` from + // `handle_roles()`, but we only make this list there before creating new roles. + // Which is probably fine as we never create to-be-deleted roles, but that'd + // just look a bit untidy. 
Anyway, the entire `pg_roles` should be in shared + // buffers already, so this shouldn't be a big deal. + let mut xact = client.transaction()?; + let existing_roles: Vec = get_existing_roles(&mut xact)?; + xact.commit()?; + for op in ops { - if op.action == "delete_role" { + // Check that role is still present in Postgres, as this could be a + // restart with the same spec after role deletion. + if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) { reassign_owned_objects(node, &op.name)?; } } From 802f17407259bf9ad027480721083662f66b13b1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 20 Jan 2023 18:19:52 +0200 Subject: [PATCH 30/63] fix: dont stop pageserver if we fail to calculate synthetic size --- pageserver/src/consumption_metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f8a0bc6f08..c07026261d 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -59,7 +59,7 @@ pub async fn collect_metrics( None, None, "synthetic size calculation", - true, + false, async move { calculate_synthetic_size_worker(synthetic_size_calculation_interval) .instrument(info_span!("synthetic_size_worker")) From 478322ebf90f8580258b02e1fb5c899c7f8ad279 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 20 Jan 2023 20:21:36 +0200 Subject: [PATCH 31/63] Fix tenant size orphans (#3377) Before only the timelines which have passed the `gc_horizon` were processed which failed with orphans at the tree_sort phase. Example input in added `test_branched_empty_timeline_size` test case. The PR changes iteration to happen through all timelines, and in addition to that, any learned branch points will be calculated as they would had been in the original implementation if the ancestor branch had been over the `gc_horizon`. This also changes how tenants where all timelines are below `gc_horizon` are handled. Previously tenant_size 0 was returned, but now they will have approximately `initdb_lsn` worth of tenant_size. The PR also adds several new tenant size tests that describe various corner cases of branching structure and `gc_horizon` setting. They are currently disabled to not consume time during CI. Co-authored-by: Joonas Koivunen Co-authored-by: Anastasia Lubennikova --- pageserver/src/tenant/size.rs | 149 ++++++++++++--- test_runner/fixtures/neon_fixtures.py | 9 +- test_runner/regress/test_tenant_size.py | 244 ++++++++++++++++++++++-- 3 files changed, 360 insertions(+), 42 deletions(-) diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index dd4bf768a7..2181d6d4dc 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -23,7 +23,13 @@ use tracing::*; pub struct ModelInputs { updates: Vec, retention_period: u64, + + /// Relevant lsns per timeline. + /// + /// This field is not required for deserialization purposes, which is mostly used in tests. The + /// LSNs explain the outcome (updates) but are not needed in size calculation. 
#[serde_as(as = "HashMap")] + #[serde(default)] timeline_inputs: HashMap, } @@ -32,6 +38,8 @@ pub struct ModelInputs { #[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] struct TimelineInputs { + #[serde_as(as = "serde_with::DisplayFromStr")] + ancestor_lsn: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] last_record: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] @@ -178,21 +186,13 @@ pub(super) async fn gather_inputs( // our advantage with `?` error handling. let mut joinset = tokio::task::JoinSet::new(); - let timelines = tenant + // refresh is needed to update gc related pitr_cutoff and horizon_cutoff + tenant .refresh_gc_info() .await .context("Failed to refresh gc_info before gathering inputs")?; - if timelines.is_empty() { - // All timelines are below tenant's gc_horizon; alternative would be to use - // Tenant::list_timelines but then those gc_info's would not be updated yet, possibly - // missing GcInfo::retain_lsns or having obsolete values for cutoff's. - return Ok(ModelInputs { - updates: vec![], - retention_period: 0, - timeline_inputs: HashMap::new(), - }); - } + let timelines = tenant.list_timelines(); // record the used/inserted cache keys here, to remove extras not to start leaking // after initial run the cache should be quite stable, but live timelines will eventually @@ -201,13 +201,25 @@ pub(super) async fn gather_inputs( let mut updates = Vec::new(); - // record the per timline values used to determine `retention_period` + // record the per timeline values useful to debug the model inputs, also used to track + // ancestor_lsn without keeping a hold of Timeline let mut timeline_inputs = HashMap::with_capacity(timelines.len()); // used to determine the `retention_period` for the size model let mut max_cutoff_distance = None; + // mapping from (TimelineId, Lsn) => if this branch point has been handled already via + // GcInfo::retain_lsns or if it needs to have its logical_size calculated. + let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new(); + for timeline in timelines { + if !timeline.is_active() { + anyhow::bail!( + "timeline {} is not active, cannot calculate tenant_size now", + timeline.timeline_id + ); + } + let last_record_lsn = timeline.get_last_record_lsn(); let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = { @@ -273,13 +285,30 @@ pub(super) async fn gather_inputs( // all timelines branch from something, because it might be impossible to pinpoint // which is the tenant_size_model's "default" branch. + + let ancestor_lsn = timeline.get_ancestor_lsn(); + updates.push(Update { - lsn: timeline.get_ancestor_lsn(), + lsn: ancestor_lsn, command: Command::BranchFrom(timeline.get_ancestor_timeline_id()), timeline_id: timeline.timeline_id, }); + if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() { + // refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches + // which are over gc_horizon. for example, a "main" branch which never received any + // updates apart from initdb not have branch points recorded. 
+ referenced_branch_froms + .entry((parent_timeline_id, timeline.get_ancestor_lsn())) + .or_default(); + } + for (lsn, _kind) in &interesting_lsns { + // mark this visited so don't need to re-process this parent + *referenced_branch_froms + .entry((timeline.timeline_id, *lsn)) + .or_default() = true; + if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) { updates.push(Update { lsn: *lsn, @@ -295,22 +324,10 @@ pub(super) async fn gather_inputs( } } - // all timelines also have an end point if they have made any progress - if last_record_lsn > timeline.get_ancestor_lsn() - && !interesting_lsns - .iter() - .any(|(lsn, _)| lsn == &last_record_lsn) - { - updates.push(Update { - lsn: last_record_lsn, - command: Command::EndOfBranch, - timeline_id: timeline.timeline_id, - }); - } - timeline_inputs.insert( timeline.timeline_id, TimelineInputs { + ancestor_lsn, last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), @@ -321,6 +338,80 @@ pub(super) async fn gather_inputs( ); } + // iterate over discovered branch points and make sure we are getting logical sizes at those + // points. + for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() { + if *handled { + continue; + } + + let timeline_id = *timeline_id; + let lsn = *lsn; + + match timeline_inputs.get(&timeline_id) { + Some(inputs) if inputs.ancestor_lsn == lsn => { + // we don't need an update at this branch point which is also point where + // timeline_id branch was branched from. + continue; + } + Some(_) => {} + None => { + // we should have this because we have iterated through all of the timelines + anyhow::bail!("missing timeline_input for {timeline_id}") + } + } + + if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) { + updates.push(Update { + lsn, + timeline_id, + command: Command::Update(*size), + }); + + needed_cache.insert((timeline_id, lsn)); + } else { + let timeline = tenant + .get_timeline(timeline_id, false) + .context("find referenced ancestor timeline")?; + let parallel_size_calcs = Arc::clone(limit); + joinset.spawn(calculate_logical_size( + parallel_size_calcs, + timeline.clone(), + lsn, + )); + + if let Some(parent_id) = timeline.get_ancestor_timeline_id() { + // we should not find new ones because we iterated tenants all timelines + anyhow::ensure!( + timeline_inputs.contains_key(&parent_id), + "discovered new timeline {parent_id} (parent of {timeline_id})" + ); + } + }; + } + + // finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch + // point. this is needed by the model. + for (timeline_id, inputs) in timeline_inputs.iter() { + let lsn = inputs.last_record; + + if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) { + // this means that the (timeline_id, last_record_lsn) represents a branch point + // we do not want to add EndOfBranch updates for these points because it doesn't fit + // into the current tenant_size_model. + continue; + } + + if lsn > inputs.ancestor_lsn { + // all timelines also have an end point if they have made any progress + updates.push(Update { + lsn, + command: Command::EndOfBranch, + timeline_id: *timeline_id, + }); + } + } + let mut have_any_error = false; while let Some(res) = joinset.join_next().await { @@ -379,6 +470,7 @@ pub(super) async fn gather_inputs( // handled by the variant order in `Command`. 
// updates.sort_unstable(); + // And another sort to handle Command::BranchFrom ordering // in case when there are multiple branches at the same LSN. let sorted_updates = sort_updates_in_tree_order(updates)?; @@ -574,7 +666,10 @@ fn updates_sort() { fn verify_size_for_multiple_branches() { // this is generated from integration test test_tenant_size_with_multiple_branches, but this way // it has the stable lsn's - let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#; + // + // timelineinputs have been left out, because those explain the inputs, but don't participate + // in further size calculations. 
+ let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d6c4c32b0b..8476066691 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1206,6 +1206,9 @@ class PageserverHttpClient(requests.Session): return res_json def tenant_size(self, tenant_id: TenantId) -> int: + return self.tenant_size_and_modelinputs(tenant_id)[0] + + def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]: """ Returns the tenant size, together with the model inputs as the second tuple item. """ @@ -1216,9 +1219,9 @@ class PageserverHttpClient(requests.Session): assert TenantId(res["id"]) == tenant_id size = res["size"] assert type(size) == int - # there are additional inputs, which are the collected raw information before being fed to the tenant_size_model - # there are no tests for those right now. 
- return size + inputs = res["inputs"] + assert type(inputs) is dict + return (size, inputs) def timeline_list( self, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 5747ae235f..72cfbc9dda 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,5 +1,6 @@ -from typing import List, Tuple +from typing import Any, List, Tuple +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.types import Lsn @@ -9,28 +10,247 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv): env = neon_simple_env (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() - size = http_client.tenant_size(tenant_id) + initial_size = http_client.tenant_size(tenant_id) - # we should never have zero, because there should be the initdb however - # this is questionable if we should have anything in this case, as the - # gc_cutoff is negative - assert ( - size == 0 - ), "initial implementation returns zero tenant_size before last_record_lsn is past gc_horizon" + # we should never have zero, because there should be the initdb "changes" + assert initial_size > 0, "initial implementation returns ~initdb tenant_size" - with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + main_branch_name = "main" + + with env.postgres.create_start( + main_branch_name, + tenant_id=tenant_id, + config_lines=["autovacuum=off", "checkpoint_timeout=10min"], + ) as pg: with pg.cursor() as cur: cur.execute("SELECT 1") row = cur.fetchone() assert row is not None assert row[0] == 1 size = http_client.tenant_size(tenant_id) - assert size == 0, "starting idle compute should not change the tenant size" + # we've disabled the autovacuum and checkpoint + # so background processes should not change the size. + # If this test will flake we should probably loosen the check + assert size == initial_size, "starting idle compute should not change the tenant size" # the size should be the same, until we increase the size over the # gc_horizon - size = http_client.tenant_size(tenant_id) - assert size == 0, "tenant_size should not be affected by shutdown of compute" + size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) + assert size == initial_size, "tenant_size should not be affected by shutdown of compute" + + expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"] + actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"])) # type: ignore + assert actual_commands == expected_commands + + +def test_branched_empty_timeline_size(neon_simple_env: NeonEnv): + """ + Issue found in production. Because the ancestor branch was under + gc_horizon, the branchpoint was "dangling" and the computation could not be + done. 
+ + Assuming gc_horizon = 50 + root: I 0---10------>20 + branch: |-------------------I---------->150 + gc_horizon + """ + env = neon_simple_env + (tenant_id, _) = env.neon_cli.create_tenant() + http_client = env.pageserver.http_client() + + initial_size = http_client.tenant_size(tenant_id) + + first_branch_timeline_id = env.neon_cli.create_branch("first-branch", tenant_id=tenant_id) + + with env.postgres.create_start("first-branch", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + wait_for_last_flush_lsn(env, pg, tenant_id, first_branch_timeline_id) + + size_after_branching = http_client.tenant_size(tenant_id) + log.info(f"size_after_branching: {size_after_branching}") + + assert size_after_branching > initial_size + + +def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv): + """ + More general version of test_branched_empty_timeline_size + + Assuming gc_horizon = 50 + + root: I 0------10 + first: I 10 + nth_0: I 10 + nth_1: I 10 + nth_n: 10------------I--------100 + """ + env = neon_simple_env + (tenant_id, _) = env.neon_cli.create_tenant() + http_client = env.pageserver.http_client() + + initial_size = http_client.tenant_size(tenant_id) + + first_branch_name = "first" + env.neon_cli.create_branch(first_branch_name, tenant_id=tenant_id) + + size_after_branching = http_client.tenant_size(tenant_id) + + # this might be flaky like test_get_tenant_size_with_multiple_branches + # https://github.com/neondatabase/neon/issues/2962 + assert size_after_branching == initial_size + + last_branch_name = first_branch_name + last_branch = None + + for i in range(0, 4): + latest_branch_name = f"nth_{i}" + last_branch = env.neon_cli.create_branch( + latest_branch_name, ancestor_branch_name=last_branch_name, tenant_id=tenant_id + ) + last_branch_name = latest_branch_name + + size_after_branching = http_client.tenant_size(tenant_id) + assert size_after_branching == initial_size + + assert last_branch is not None + + with env.postgres.create_start(last_branch_name, tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + wait_for_last_flush_lsn(env, pg, tenant_id, last_branch) + + size_after_writes = http_client.tenant_size(tenant_id) + assert size_after_writes > initial_size + + +@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") +def test_branch_point_within_horizon(neon_simple_env: NeonEnv): + """ + gc_horizon = 15 + + main: 0--I-10------>20 + branch: |-------------------I---------->150 + gc_horizon + """ + + env = neon_simple_env + gc_horizon = 20_000 + (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) + http_client = env.pageserver.http_client() + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") + flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + + size_before_branching = http_client.tenant_size(tenant_id) + + assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int + + branch_id = env.neon_cli.create_branch( + "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn + ) + + with env.postgres.create_start("branch", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + 
cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, branch_id) + + size_after = http_client.tenant_size(tenant_id) + + assert size_before_branching < size_after + + +@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") +def test_parent_within_horizon(neon_simple_env: NeonEnv): + """ + gc_horizon = 5 + + main: 0----10----I->20 + branch: |-------------------I---------->150 + gc_horizon + """ + + env = neon_simple_env + gc_horizon = 200_000 + (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) + http_client = env.pageserver.http_client() + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") + + flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE t00 AS SELECT i::bigint n FROM generate_series(0, 2000) s(i)") + + wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + + size_before_branching = http_client.tenant_size(tenant_id) + + assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int + + branch_id = env.neon_cli.create_branch( + "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn + ) + + with env.postgres.create_start("branch", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, branch_id) + + size_after = http_client.tenant_size(tenant_id) + + assert size_before_branching < size_after + + +@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") +def test_only_heads_within_horizon(neon_simple_env: NeonEnv): + """ + gc_horizon = small + + main: 0--------10-----I>20 + first: |-----------------------------I>150 + second: |---------I>30 + """ + + env = neon_simple_env + (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": "1024"}) + http_client = env.pageserver.http_client() + + initial_size = http_client.tenant_size(tenant_id) + + first_id = env.neon_cli.create_branch("first", tenant_id=tenant_id) + second_id = env.neon_cli.create_branch("second", tenant_id=tenant_id) + + ids = {"main": main_id, "first": first_id, "second": second_id} + + latest_size = None + + # gc is not expected to change the results + + for branch_name, amount in [("main", 2000), ("first", 15000), ("second", 3000)]: + with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {amount}) s(i)" + ) + wait_for_last_flush_lsn(env, pg, tenant_id, ids[branch_name]) + size_now = http_client.tenant_size(tenant_id) + if latest_size is not None: + assert size_now > latest_size + else: + assert size_now > initial_size + + latest_size = size_now def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): From 664a69e65ba79c8a3b6c5bd8d428de41f2243bd7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 23 Jan 2023 10:51:09 +0200 Subject: [PATCH 32/63] Fix slru_segment_key_range function: segno was assigned to incorrect Key field (#3354) --- pageserver/src/pgdatadir_mapping.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6ae70e3a30..cc521c5e35 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1405,15 +1405,15 @@ fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { Key { field1: 0x01, field2, - field3: segno, - field4: 0, + field3: 1, + field4: segno, field5: 0, field6: 0, }..Key { field1: 0x01, field2, - field3: segno, - field4: 0, + field3: 1, + field4: segno, field5: 1, field6: 0, } From edb02d3299772e9075561a3b41141d42894e3287 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 23 Jan 2023 15:08:48 +0200 Subject: [PATCH 33/63] Adding pageserver3 to staging (#3403) --- .github/ansible/staging.us-east-2.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 4891875369..b46e729e32 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -29,6 +29,8 @@ storage: ansible_host: i-0565a8b4008aa3f40 pageserver-2.us-east-2.aws.neon.build: ansible_host: i-01e31cdf7e970586a + pageserver-3.us-east-2.aws.neon.build: + ansible_host: i-0602a0291365ef7cc safekeepers: hosts: From f67f0c1c11b4f5226225195b5be956c154ea4200 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 23 Jan 2023 17:12:51 +0200 Subject: [PATCH 34/63] More tenant size fixes (#3410) Small changes, but hopefully this will help with the panic detected in staging, for which we cannot get the debugging information right now (end-of-branch before branch-point). --- libs/tenant_size_model/src/lib.rs | 83 ++++++++++++---------- libs/tenant_size_model/src/main.rs | 107 +++++++++++++++-------------- pageserver/src/tenant/size.rs | 15 +++- 3 files changed, 114 insertions(+), 91 deletions(-) diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index 92bec8aebe..b156e1be9d 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -134,22 +134,25 @@ impl Storage { op: Cow<'static, str>, lsn: u64, size: Option, - ) where + ) -> anyhow::Result<()> + where K: std::borrow::Borrow, Q: std::hash::Hash + Eq + std::fmt::Debug, { - let lastseg_id = *self.branches.get(branch).unwrap(); + let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; let newseg_id = self.segments.len(); let lastseg = &mut self.segments[lastseg_id]; assert!(lsn > lastseg.end_lsn); + let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; + let newseg = Segment { op, parent: Some(lastseg_id), start_lsn: lastseg.end_lsn, end_lsn: lsn, - start_size: lastseg.end_size.unwrap(), + start_size, end_size: size, children_after: Vec::new(), needed: false, @@ -158,6 +161,8 @@ impl Storage { self.segments.push(newseg); *self.branches.get_mut(branch).expect("read already") = newseg_id; + + Ok(()) } /// Advances the branch with the named operation, by the relative LSN and logical size bytes. 
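The hunks in this patch follow one pattern: lookups that previously called `unwrap()` on `self.branches` or on the last segment's `end_size` (the likely source of the panic seen in staging) now return `anyhow::Result` and bail with a descriptive message. A minimal standalone sketch of that shape, using simplified concrete types instead of the real generic `Storage`, and an illustrative helper name (`last_segment`) that does not exist in the crate:

    use anyhow::bail;
    use std::collections::HashMap;

    struct Segment {
        end_size: Option<u64>,
    }

    struct Storage {
        branches: HashMap<String, usize>,
        segments: Vec<Segment>,
    }

    impl Storage {
        // The let-else + bail! shape used by insert_point/modify_branch:
        // an unknown branch, or a branch whose last segment has no end_size,
        // becomes an error instead of a panic.
        fn last_segment(&self, branch: &str) -> anyhow::Result<(usize, u64)> {
            let Some(&seg_id) = self.branches.get(branch) else {
                bail!("branch not found: {branch:?}")
            };
            let Some(end_size) = self.segments[seg_id].end_size else {
                bail!("no end_size on latest segment for {branch:?}")
            };
            Ok((seg_id, end_size))
        }
    }

Because the mutation helpers now return `Result`, `insert`, `update` and `delete` simply forward it, which is what the signature changes in the following hunks amount to.
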
@@ -167,21 +172,24 @@ impl Storage { op: Cow<'static, str>, lsn_bytes: u64, size_bytes: i64, - ) where + ) -> anyhow::Result<()> + where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - let lastseg_id = *self.branches.get(branch).unwrap(); + let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; let newseg_id = self.segments.len(); let lastseg = &mut self.segments[lastseg_id]; + let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; + let newseg = Segment { op, parent: Some(lastseg_id), start_lsn: lastseg.end_lsn, end_lsn: lastseg.end_lsn + lsn_bytes, - start_size: lastseg.end_size.unwrap(), - end_size: Some((lastseg.end_size.unwrap() as i64 + size_bytes) as u64), + start_size: last_end_size, + end_size: Some((last_end_size as i64 + size_bytes) as u64), children_after: Vec::new(), needed: false, }; @@ -189,33 +197,33 @@ impl Storage { self.segments.push(newseg); *self.branches.get_mut(branch).expect("read already") = newseg_id; + Ok(()) } - pub fn insert(&mut self, branch: &Q, bytes: u64) + pub fn insert(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - self.modify_branch(branch, "insert".into(), bytes, bytes as i64); + self.modify_branch(branch, "insert".into(), bytes, bytes as i64) } - pub fn update(&mut self, branch: &Q, bytes: u64) + pub fn update(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - self.modify_branch(branch, "update".into(), bytes, 0i64); + self.modify_branch(branch, "update".into(), bytes, 0i64) } - pub fn delete(&mut self, branch: &Q, bytes: u64) + pub fn delete(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)); + self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)) } - /// Panics if the parent branch cannot be found. 
pub fn branch(&mut self, parent: &Q, name: K) -> anyhow::Result<()> where K: std::borrow::Borrow + std::fmt::Debug, @@ -236,7 +244,7 @@ impl Storage { Ok(()) } - pub fn calculate(&mut self, retention_period: u64) -> SegmentSize { + pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result { // Phase 1: Mark all the segments that need to be retained for (_branch, &last_seg_id) in self.branches.iter() { let last_seg = &self.segments[last_seg_id]; @@ -261,7 +269,7 @@ impl Storage { self.size_from_snapshot_later(0) } - fn size_from_wal(&self, seg_id: usize) -> SegmentSize { + fn size_from_wal(&self, seg_id: usize) -> anyhow::Result { let seg = &self.segments[seg_id]; let this_size = seg.end_lsn - seg.start_lsn; @@ -272,10 +280,10 @@ impl Storage { for &child_id in seg.children_after.iter() { // try each child both ways let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id); + let p1 = self.size_from_wal(child_id)?; let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id); + let p2 = self.size_from_snapshot_later(child_id)?; if p1.total() < p2.total() { p1 } else { @@ -286,15 +294,15 @@ impl Storage { }; children.push(p); } - SegmentSize { + Ok(SegmentSize { seg_id, method: if seg.needed { WalNeeded } else { Wal }, this_size, children, - } + }) } - fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize { + fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result { // If this is needed, then it's time to do the snapshot and continue // with wal method. let seg = &self.segments[seg_id]; @@ -305,10 +313,10 @@ impl Storage { for &child_id in seg.children_after.iter() { // try each child both ways let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id); + let p1 = self.size_from_wal(child_id)?; let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id); + let p2 = self.size_from_snapshot_later(child_id)?; if p1.total() < p2.total() { p1 } else { @@ -319,12 +327,12 @@ impl Storage { }; children.push(p); } - SegmentSize { + Ok(SegmentSize { seg_id, method: WalNeeded, this_size: seg.start_size, children, - } + }) } else { // If any of the direct children are "needed", need to be able to reconstruct here let mut children_needed = false; @@ -339,7 +347,7 @@ impl Storage { let method1 = if !children_needed { let mut children = Vec::new(); for child in seg.children_after.iter() { - children.push(self.size_from_snapshot_later(*child)); + children.push(self.size_from_snapshot_later(*child)?); } Some(SegmentSize { seg_id, @@ -355,20 +363,25 @@ impl Storage { let method2 = if children_needed || seg.children_after.len() >= 2 { let mut children = Vec::new(); for child in seg.children_after.iter() { - children.push(self.size_from_wal(*child)); + children.push(self.size_from_wal(*child)?); } + let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") }; Some(SegmentSize { seg_id, method: SnapshotAfter, - this_size: seg.end_size.unwrap(), + this_size, children, }) } else { None }; - match (method1, method2) { - (None, None) => panic!(), + Ok(match (method1, method2) { + (None, None) => anyhow::bail!( + "neither method was applicable: children_after={}, children_needed={}", + seg.children_after.len(), + children_needed + ), (Some(method), None) => method, (None, Some(method)) => method, (Some(method1), Some(method2)) => { @@ -378,7 +391,7 @@ impl Storage { method2 } } - } + }) } } diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs 
index 9378a98a09..e32dd055f4 100644 --- a/libs/tenant_size_model/src/main.rs +++ b/libs/tenant_size_model/src/main.rs @@ -7,118 +7,118 @@ use tenant_size_model::{Segment, SegmentSize, Storage}; // Main branch only. Some updates on it. -fn scenario_1() -> (Vec, SegmentSize) { +fn scenario_1() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } // Main branch only. Some updates on it. -fn scenario_2() -> (Vec, SegmentSize) { +fn scenario_2() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } // Branch - storage.branch("main", "child").unwrap(); - storage.update("child", 1_000); + storage.branch("main", "child")?; + storage.update("child", 1_000)?; // More updates on parent - storage.update("main", 1_000); + storage.update("main", 1_000)?; - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } // Like 2, but more updates on main -fn scenario_3() -> (Vec, SegmentSize) { +fn scenario_3() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } // Branch - storage.branch("main", "child").unwrap(); - storage.update("child", 1_000); + storage.branch("main", "child")?; + storage.update("child", 1_000)?; // More updates on parent for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } // Diverged branches -fn scenario_4() -> (Vec, SegmentSize) { +fn scenario_4() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } // Branch - storage.branch("main", "child").unwrap(); - storage.update("child", 1_000); + storage.branch("main", "child")?; + storage.update("child", 1_000)?; // More updates on parent for _ in 0..8 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } -fn scenario_5() -> (Vec, SegmentSize) { +fn scenario_5() -> anyhow::Result<(Vec, SegmentSize)> { let mut storage = Storage::new("a"); - storage.insert("a", 5000); - storage.branch("a", "b").unwrap(); - storage.update("b", 4000); - storage.update("a", 2000); - storage.branch("a", "c").unwrap(); - storage.insert("c", 4000); - storage.insert("a", 2000); + 
storage.insert("a", 5000)?; + storage.branch("a", "b")?; + storage.update("b", 4000)?; + storage.update("a", 2000)?; + storage.branch("a", "c")?; + storage.insert("c", 4000)?; + storage.insert("a", 2000)?; - let size = storage.calculate(5000); + let size = storage.calculate(5000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } -fn scenario_6() -> (Vec, SegmentSize) { +fn scenario_6() -> anyhow::Result<(Vec, SegmentSize)> { use std::borrow::Cow; const NO_OP: Cow<'static, str> = Cow::Borrowed(""); @@ -133,18 +133,18 @@ fn scenario_6() -> (Vec, SegmentSize) { let mut storage = Storage::new(None); - storage.branch(&None, branches[0]).unwrap(); // at 0 - storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064 - storage.branch(&branches[0], branches[1]).unwrap(); // at 108951064 - storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472 - storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424 - storage.branch(&branches[0], branches[2]).unwrap(); // at 283415424 - storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616 - storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400 + storage.branch(&None, branches[0])?; // at 0 + storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064 + storage.branch(&branches[0], branches[1])?; // at 108951064 + storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472 + storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424 + storage.branch(&branches[0], branches[2])?; // at 283415424 + storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616 + storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400 - let size = storage.calculate(100_000); + let size = storage.calculate(100_000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } fn main() { @@ -163,7 +163,8 @@ fn main() { eprintln!("invalid scenario {}", other); std::process::exit(1); } - }; + } + .unwrap(); graphviz_tree(&segments, &size); } @@ -251,7 +252,7 @@ fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) { #[test] fn scenarios_return_same_size() { - type ScenarioFn = fn() -> (Vec, SegmentSize); + type ScenarioFn = fn() -> anyhow::Result<(Vec, SegmentSize)>; let truths: &[(u32, ScenarioFn, _)] = &[ (line!(), scenario_1, 8000), (line!(), scenario_2, 9000), @@ -262,7 +263,7 @@ fn scenarios_return_same_size() { ]; for (line, scenario, expected) in truths { - let (_, size) = scenario(); + let (_, size) = scenario().unwrap(); assert_eq!(*expected, size.total_children(), "scenario on line {line}"); } } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 2181d6d4dc..61cb32fc76 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -194,6 +194,15 @@ pub(super) async fn gather_inputs( let timelines = tenant.list_timelines(); + if timelines.is_empty() { + // perhaps the tenant has just been created, and as such doesn't have any data yet + return Ok(ModelInputs { + updates: vec![], + retention_period: 0, + timeline_inputs: HashMap::default(), + }); + } + // record the used/inserted cache keys here, to remove extras not to start leaking // after initial run the cache should be quite stable, but live timelines will eventually // require new lsns to be inspected. 
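For context on what the `storage.calculate(...)` call in the next hunk is choosing between: for each segment that is not marked `needed`, the model compares the size of retaining WAL for it (roughly `end_lsn - start_lsn`) against the size of taking a logical snapshot later, and keeps the cheaper option; `needed` segments, those inside the retention window, always retain WAL. A toy sketch of that per-segment choice, deliberately independent of the real `Storage`/`SegmentSize` types (`ToySegment` and `retained_bytes` are illustrative names only):

    // Toy illustration: the real model recurses over a segment tree and sums
    // the children, but the per-segment decision is this comparison.
    struct ToySegment {
        wal_bytes: u64,      // end_lsn - start_lsn: cost of keeping WAL
        snapshot_bytes: u64, // logical size at the would-be snapshot point
        needed: bool,        // inside the retention window: WAL must stay
    }

    fn retained_bytes(seg: &ToySegment) -> u64 {
        if seg.needed {
            seg.wal_bytes
        } else {
            seg.wal_bytes.min(seg.snapshot_bytes)
        }
    }

A segment with 5_000 bytes of WAL but a 1_000-byte end state would be kept as a snapshot under this rule, which is the kind of trade-off the scenarios in main.rs exercise.
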
@@ -505,10 +514,10 @@ impl ModelInputs { let Lsn(now) = *lsn; match op { Command::Update(sz) => { - storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz)); + storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?; } Command::EndOfBranch => { - storage.insert_point(&Some(*timeline_id), "".into(), now, None); + storage.insert_point(&Some(*timeline_id), "".into(), now, None)?; } Command::BranchFrom(parent) => { // This branch command may fail if it cannot find a parent to branch from. @@ -517,7 +526,7 @@ impl ModelInputs { } } - Ok(storage.calculate(self.retention_period).total_children()) + Ok(storage.calculate(self.retention_period)?.total_children()) } } From eb36403e71210b1be7e2482fc385b8da8c149d5f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 31 Jan 2023 14:06:35 +0100 Subject: [PATCH 35/63] Release 2023 01 31 (#3497) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kirill Bulatov Co-authored-by: Heikki Linnakangas Co-authored-by: Anastasia Lubennikova Co-authored-by: bojanserafimov Co-authored-by: Christian Schwarz Co-authored-by: Alexey Kondratov Co-authored-by: Joonas Koivunen Co-authored-by: Konstantin Knizhnik Co-authored-by: Shany Pozin Co-authored-by: Sergey Melnikov Co-authored-by: Dmitry Rodionov Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com> Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Lassi Pölönen --- .../actions/run-python-test-set/action.yml | 4 +- .github/ansible/deploy.yaml | 6 +- .../dev-us-east-2-beta.neon-proxy-link.yaml | 1 + ...prod-us-east-2-delta.neon-proxy-link.yaml} | 24 +- ...us-west-2-eta.neon-proxy-scram-legacy.yaml | 61 ++ .github/workflows/build_and_test.yml | 572 +++--------- .github/workflows/deploy-dev.yml | 179 ++++ .github/workflows/deploy-prod.yml | 277 ++++++ .github/workflows/release.yml | 33 + Cargo.lock | 328 ++++--- Cargo.toml | 10 +- ...ompute-node-v14 => Dockerfile.compute-node | 34 +- Dockerfile.compute-node-v15 | 220 ----- compute_tools/Cargo.toml | 3 + compute_tools/src/bin/compute_ctl.rs | 32 +- compute_tools/src/http/api.rs | 27 +- compute_tools/src/logger.rs | 24 +- compute_tools/src/params.rs | 8 +- compute_tools/src/spec.rs | 23 +- libs/metrics/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 37 +- libs/tracing-utils/Cargo.toml | 17 + libs/tracing-utils/src/http.rs | 96 ++ libs/tracing-utils/src/lib.rs | 168 ++++ libs/utils/Cargo.toml | 1 + libs/utils/src/http/error.rs | 17 +- libs/utils/src/logging.rs | 2 +- pageserver/Cargo.toml | 3 +- pageserver/benches/bench_layer_map.rs | 224 ++--- pageserver/src/basebackup.rs | 50 +- pageserver/src/bin/pageserver.rs | 57 +- pageserver/src/broker_client.rs | 48 + pageserver/src/config.rs | 28 + pageserver/src/consumption_metrics.rs | 24 +- pageserver/src/context.rs | 199 ++++ pageserver/src/http/openapi_spec.yml | 10 +- pageserver/src/http/routes.rs | 170 ++-- pageserver/src/import_datadir.rs | 52 +- pageserver/src/lib.rs | 3 +- pageserver/src/metrics.rs | 166 +++- pageserver/src/page_service.rs | 223 +++-- pageserver/src/pgdatadir_mapping.rs | 169 ++-- pageserver/src/repository.rs | 11 + pageserver/src/task_mgr.rs | 45 +- pageserver/src/tenant.rs | 576 ++++++------ pageserver/src/tenant/config.rs | 7 +- pageserver/src/tenant/layer_map.rs | 877 +++++++++--------- .../layer_map/historic_layer_coverage.rs | 583 ++++++++++++ .../src/tenant/layer_map/layer_coverage.rs | 154 +++ pageserver/src/tenant/mgr.rs | 236 +++-- 
.../src/tenant/remote_timeline_client.rs | 25 +- pageserver/src/tenant/size.rs | 16 +- pageserver/src/tenant/storage_layer.rs | 47 + pageserver/src/tenant/tasks.rs | 13 +- pageserver/src/tenant/timeline.rs | 327 +++++-- .../src/{ => tenant/timeline}/walreceiver.rs | 44 - .../walreceiver/connection_manager.rs | 55 +- .../walreceiver/walreceiver_connection.rs | 29 +- pageserver/src/walingest.rs | 418 ++++++--- pageserver/src/walredo.rs | 329 +++++-- poetry.lock | 247 ++++- proxy/src/main.rs | 4 +- pyproject.toml | 1 + safekeeper/src/bin/safekeeper.rs | 7 +- scripts/force_layer_download.py | 324 +++++++ storage_broker/src/bin/storage_broker.rs | 4 +- test_runner/fixtures/metrics.py | 12 +- test_runner/regress/test_tenant_conf.py | 55 +- test_runner/regress/test_tenant_detach.py | 46 +- test_runner/regress/test_tenants.py | 56 +- workspace_hack/Cargo.toml | 8 +- 71 files changed, 5779 insertions(+), 2408 deletions(-) rename .github/helm-values/{production.proxy.yaml => prod-us-east-2-delta.neon-proxy-link.yaml} (80%) create mode 100644 .github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml create mode 100644 .github/workflows/deploy-dev.yml create mode 100644 .github/workflows/deploy-prod.yml create mode 100644 .github/workflows/release.yml rename Dockerfile.compute-node-v14 => Dockerfile.compute-node (86%) delete mode 100644 Dockerfile.compute-node-v15 create mode 100644 libs/tracing-utils/Cargo.toml create mode 100644 libs/tracing-utils/src/http.rs create mode 100644 libs/tracing-utils/src/lib.rs create mode 100644 pageserver/src/broker_client.rs create mode 100644 pageserver/src/context.rs create mode 100644 pageserver/src/tenant/layer_map/historic_layer_coverage.rs create mode 100644 pageserver/src/tenant/layer_map/layer_coverage.rs rename pageserver/src/{ => tenant/timeline}/walreceiver.rs (83%) rename pageserver/src/{ => tenant/timeline}/walreceiver/connection_manager.rs (96%) rename pageserver/src/{ => tenant/timeline}/walreceiver/walreceiver_connection.rs (94%) create mode 100644 scripts/force_layer_download.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 990c7e25a9..29b04a3478 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -123,8 +123,8 @@ runs: exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then - # -n4 uses four processes to run tests via pytest-xdist - EXTRA_PARAMS="-n4 $EXTRA_PARAMS" + # -n16 uses sixteen processes to run tests via pytest-xdist + EXTRA_PARAMS="-n16 $EXTRA_PARAMS" # --dist=loadgroup points tests marked with @pytest.mark.xdist_group # to the same worker to make @pytest.mark.order work with xdist diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 4adc685684..a17dc9c78f 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -117,7 +117,8 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers tags: - pageserver @@ -186,6 +187,7 @@ shell: cmd: | INSTANCE_ID=$(curl -s 
http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers tags: - safekeeper diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index cb062f705d..157ae66ed1 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -8,6 +8,7 @@ settings: authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" + domain: "pg.neon.build" sentryEnvironment: "staging" metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml similarity index 80% rename from .github/helm-values/production.proxy.yaml rename to .github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml index dbaf3cd096..eff24302bb 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml @@ -1,37 +1,37 @@ +# Helm chart values for neon-proxy-link. +# This is a YAML-formatted file. + +image: + repository: neondatabase/neon + settings: authBackend: "link" authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" + domain: "pg.neon.tech" sentryEnvironment: "production" # -- Additional labels for zenith-proxy pods podLabels: zenith_service: proxy zenith_env: production - zenith_region: us-west-2 - zenith_region_slug: oregon + zenith_region: us-east-2 + zenith_region_slug: us-east-2 service: + type: LoadBalancer annotations: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internal - external-dns.alpha.kubernetes.io/hostname: proxy-release.local - type: LoadBalancer + external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.delta.us-east-2.aws.neon.tech exposedService: annotations: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack + external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.delta.us-east-2.aws.neon.tech extraManifests: - apiVersion: operator.victoriametrics.com/v1beta1 diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml new file mode 100644 index 0000000000..3a5cde4b01 --- /dev/null +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -0,0 +1,61 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. 
+ +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.cloud.neon.tech" + sentryEnvironment: "production" + wssPort: 8443 + metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionInterval: "10min" + +# -- Additional labels for neon-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: prod + zenith_region: us-west-2 + zenith_region_slug: us-west-2 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.eta.us-west-2.aws.neon.tech + httpsPort: 443 + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 918e568e27..89e12360f9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1,4 +1,4 @@ -name: Test and Deploy +name: Build and Test on: push: @@ -19,10 +19,12 @@ concurrency: env: RUST_BACKTRACE: 1 COPT: '-Werror' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: tag: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -50,7 +52,7 @@ jobs: id: build-tag check-codestyle-python: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned options: --init @@ -85,7 +87,7 @@ jobs: run: poetry run mypy . 
check-codestyle-rust: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -97,16 +99,16 @@ jobs: submodules: true fetch-depth: 1 - - name: Restore cargo deps cache - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# Disabled for now +# - name: Restore cargo deps cache +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -133,7 +135,7 @@ jobs: run: cargo deny check build-neon: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -141,7 +143,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - env: BUILD_TYPE: ${{ matrix.build_type }} GIT_VERSION: ${{ github.sha }} @@ -194,24 +195,26 @@ jobs: echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV + # Disabled for now # Don't include the ~/.cargo/registry/src directory. It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache # directory, and it's faster to let 'cargo' to rebuild it from the # compressed crates. 
- - name: Cache cargo deps - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - # Fall back to older versions of the key, if no cache for current Cargo.lock was found - key: | - v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- +# - name: Cache cargo deps +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# ~/.cargo/registry/ +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# # Fall back to older versions of the key, if no cache for current Cargo.lock was found +# key: | +# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - name: Cache postgres v14 build id: cache_pg_14 @@ -301,7 +304,7 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -334,7 +337,7 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -365,7 +368,7 @@ jobs: # while coverage is currently collected for the debug ones merge-allure-report: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -402,7 +405,7 @@ jobs: DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json coverage-report: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -418,16 +421,17 @@ jobs: submodules: true fetch-depth: 1 - - name: Restore cargo deps cache - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# Disabled for now +# - name: Restore cargo deps cache +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# ~/.cargo/registry/ +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -477,7 +481,7 @@ jobs: }" trigger-e2e-tests: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init @@ -522,9 +526,10 @@ jobs: }" neon-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + # https://github.com/GoogleContainerTools/kaniko/issues/2005 + container: gcr.io/kaniko-project/executor:v1.7.0-debug defaults: run: shell: sh -eu {0} @@ -540,12 +545,16 @@ jobs: run: echo 
"{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build neon - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied + - name: Cleanup ECR folder + run: rm -rf ~/.ecr compute-tools-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + container: gcr.io/kaniko-project/executor:v1.7.0-debug defaults: run: shell: sh -eu {0} @@ -558,11 +567,14 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute tools - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr compute-node-image: - runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + runs-on: [ self-hosted, gen3, large ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug needs: [ tag ] strategy: fail-fast: false @@ -583,10 +595,13 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr vm-compute-node-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag, compute-node-image ] strategy: fail-fast: false @@ -631,7 +646,7 @@ jobs: test-images: needs: [ tag, neon-image, compute-node-image, compute-tools-image ] - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] steps: - name: Checkout @@ -673,20 +688,39 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] needs: [ tag, test-images, vm-compute-node-image ] + container: golang:1.19-bullseye if: github.event_name != 'workflow_dispatch' - container: amazon/aws-cli - strategy: - fail-fast: false - matrix: - name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools] steps: - - name: Promote image to latest + - name: Install Crane & ECR helper + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' run: | - export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text) - aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Add latest tag to images + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr push-docker-hub: runs-on: [ self-hosted, dev, x64 ] @@ -776,114 +810,11 @@ jobs: crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - calculate-deploy-targets: - runs-on: [ self-hosted, dev, x64 ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - outputs: - matrix-include: ${{ steps.set-matrix.outputs.include }} - steps: - - id: set-matrix - run: | - if [[ "$GITHUB_REF_NAME" == "release" ]]; then - PRODUCTION='{"env_name": 
"production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' - echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'" - exit 1 - fi - - deploy: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - - eval $(ssh-agent) - echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key - echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater - ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly - needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ eu-west-1, us-east-2 ] - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version + - name: Cleanup ECR folder + run: rm -rf ~/.ecr deploy-pr-test-new: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly @@ -915,311 +846,40 @@ jobs: ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version - deploy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] - environment: - name: prod-${{ matrix.target_region }} + if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Checkout uses: actions/checkout@v3 with: - submodules: true + submodules: false fetch-depth: 0 - - name: Redeploy + - name: Trigger deploy workflow + env: + GH_TOKEN: ${{ github.token }} run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh + gh workflow run deploy-dev.yml --ref main -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh + gh workflow run deploy-prod.yml --ref release -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 fi - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-proxy: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Add curl - run: apt update && apt install curl -y - - - name: Store kubeconfig file - run: | - echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Setup helm v3 - run: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Re-deploy proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker: - name: deploy storage broker on old staging and old prod - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Add curl - run: apt update && apt install curl -y - - - name: Store kubeconfig file - run: | - echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Setup helm v3 - run: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - deploy-proxy-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - deploy_link_proxy: true - deploy_legacy_scram_proxy: true - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - deploy_link_proxy: false - deploy_legacy_scram_proxy: false - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy scram proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy link proxy - if: matrix.deploy_link_proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy legacy scram proxy - if: matrix.deploy_legacy_scram_proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker-dev-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - deploy-proxy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - promote-compatibility-data: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ deploy, deploy-proxy ] + needs: [ push-docker-hub, tag, regress-tests ] if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml new file mode 100644 index 0000000000..409517bf63 --- /dev/null +++ b/.github/workflows/deploy-dev.yml @@ -0,0 +1,179 @@ +name: Neon Deploy dev + +on: + workflow_dispatch: + inputs: + dockerTag: + description: 'Docker tag to deploy' + required: true + type: string + branch: + description: 'Branch or commit used for deploy scripts and configs' + required: true + type: string + default: 'main' + deployStorage: + description: 'Deploy storage' + required: true + type: boolean + default: true + deployProxy: + description: 'Deploy proxy' + required: true + type: boolean + default: true + deployStorageBroker: + description: 'Deploy storage-broker' + required: true + type: boolean + default: true + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +concurrency: + group: deploy-dev + cancel-in-progress: false + +jobs: + deploy-storage-new: + runs-on: [ self-hosted, gen3, small ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + options: --user root --privileged + if: inputs.deployStorage + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ eu-west-1, us-east-2 ] + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + ./get_binaries.sh + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e 
SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy-proxy-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + deploy_link_proxy: true + deploy_legacy_scram_proxy: true + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy scram proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy link proxy + if: matrix.deploy_link_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy legacy scram proxy + if: matrix.deploy_legacy_scram_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache + + deploy-storage-broker-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ 
matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml new file mode 100644 index 0000000000..e1954b5540 --- /dev/null +++ b/.github/workflows/deploy-prod.yml @@ -0,0 +1,277 @@ +name: Neon Deploy prod + +on: + workflow_dispatch: + inputs: + dockerTag: + description: 'Docker tag to deploy' + required: true + type: string + branch: + description: 'Branch or commit used for deploy scripts and configs' + required: true + type: string + default: 'main' + deployStorage: + description: 'Deploy storage' + required: true + type: boolean + default: true + deployProxy: + description: 'Deploy proxy' + required: true + type: boolean + default: true + deployStorageBroker: + description: 'Deploy storage-broker' + required: true + type: boolean + default: true + +concurrency: + group: deploy-prod + cancel-in-progress: false + +jobs: + deploy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployStorage + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + ./get_binaries.sh + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + deploy-proxy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + deploy_link_proxy: true + deploy_legacy_scram_proxy: false + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + deploy_link_proxy: false + deploy_legacy_scram_proxy: true + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy scram proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag 
}} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy link proxy + if: matrix.deploy_link_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy legacy scram proxy + if: matrix.deploy_legacy_scram_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + deploy-storage-broker-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + # Deploy to old account below + + deploy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorage + defaults: + run: + shell: bash + environment: + name: prod-old + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + eval $(ssh-agent) + echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key + echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater + 
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy-proxy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + environment: + name: prod-old + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Store kubeconfig file + run: | + echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Add neon helm chart + run: helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache + + deploy-storage-broker: + name: deploy storage broker on old staging and old prod + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + environment: + name: prod-old + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Store kubeconfig file + run: | + echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Add neon helm chart + run: helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000..49e04ee001 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,33 @@ +name: Create Release Branch + +on: + schedule: + - cron: '0 10 * * 2' + +jobs: + create_release_branch: + runs-on: [ubuntu-latest] + + steps: + - name: Check out code + uses: actions/checkout@v3 + with: + ref: main + + - name: Get current date + id: date + run: echo "date=(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Create release branch + run: git checkout -b release/${{ steps.date.outputs.date }} + + - name: Push new branch + run: git push origin release/${{ steps.date.outputs.date }} + + - name: Create pull request into release + uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0 + with: + GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} + head: release/${{ steps.date.outputs.date }} + base: release + title: Release ${{ steps.date.outputs.date }} diff --git a/Cargo.lock b/Cargo.lock index d8aba9ba68..2985a654f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,11 +37,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "amplify_num" -version = "0.4.1" -source = "git+https://github.com/rust-amplify/rust-amplify.git?tag=v4.0.0-beta.1#3ad006cf2804e1862ec7725a7684a493f3023523" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -66,6 +61,15 @@ dependencies = [ "backtrace", ] +[[package]] +name = "archery" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02" +dependencies = [ + "static_assertions", +] + [[package]] name = "asn1-rs" version = "0.5.1" @@ -137,15 +141,6 @@ dependencies = [ "syn", ] -[[package]] -name = "atomic-polyfill" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28" -dependencies = [ - "critical-section", -] - [[package]] name = "atty" version = "0.2.14" @@ -629,9 +624,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.11.1" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" +checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" [[package]] name = "byteorder" @@ -750,13 +745,13 @@ dependencies = [ [[package]] name = "clap" -version = "4.0.32" +version = "4.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39" +checksum = "4ec7a4128863c188deefe750ac1d1dfe66c236909f845af04beed823638dc1b2" dependencies = [ "bitflags", "clap_derive", - "clap_lex 0.3.0", + "clap_lex 0.3.1", "is-terminal", "once_cell", "strsim", @@ -765,9 +760,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.0.21" +version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014" +checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" dependencies = [ "heck", "proc-macro-error", @@ -787,9 +782,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" dependencies = [ "os_str_bytes", ] @@ -832,10 +827,11 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.0.32", + "clap 4.1.1", "futures", "hyper", "notify", + "opentelemetry", "postgres", "regex", "serde", @@ -844,7 +840,9 @@ dependencies = [ "tokio", "tokio-postgres", "tracing", + "tracing-opentelemetry", "tracing-subscriber", + "tracing-utils", "url", "workspace_hack", ] @@ -887,7 +885,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.32", + "clap 4.1.1", "comfy-table", "git-version", "nix", @@ -988,12 +986,6 @@ dependencies = [ "itertools", ] -[[package]] -name = "critical-section" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" - [[package]] 
name = "crossbeam-channel" version = "0.5.6" @@ -1030,12 +1022,11 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.11" +version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" +checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" dependencies = [ "cfg-if", - "once_cell", ] [[package]] @@ -1152,6 +1143,19 @@ dependencies = [ "syn", ] +[[package]] +name = "dashmap" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" +dependencies = [ + "cfg-if", + "hashbrown 0.12.3", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "data-encoding" version = "2.3.3" @@ -1506,15 +1510,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hash32" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" -dependencies = [ - "byteorder", -] - [[package]] name = "hashbrown" version = "0.12.3" @@ -1530,19 +1525,6 @@ dependencies = [ "ahash", ] -[[package]] -name = "heapless" -version = "0.7.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" -dependencies = [ - "atomic-polyfill", - "hash32", - "rustc_version", - "spin 0.9.4", - "stable_deref_trait", -] - [[package]] name = "heck" version = "0.4.0" @@ -1804,9 +1786,9 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" dependencies = [ "libc", "windows-sys", @@ -1916,12 +1898,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "libm" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" - [[package]] name = "link-cplusplus" version = "1.0.8" @@ -2067,9 +2043,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" [[package]] name = "nix" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46a58d1d356c6597d08cde02c2f09d785b09e28711837b1ed667dc652c08a694" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags", "cfg-if", @@ -2081,9 +2057,9 @@ dependencies = [ [[package]] name = "nom" -version = "7.1.2" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" dependencies = [ "memchr", "minimal-lexical", @@ -2154,7 +2130,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -2203,6 +2178,108 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", +] + +[[package]] +name = "opentelemetry-http" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry_api", + "reqwest", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde" +dependencies = [ + "async-trait", + "futures", + "futures-util", + "http", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-proto", + "prost", + "reqwest", + "thiserror", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28" +dependencies = [ + "futures", + "futures-util", + "opentelemetry", + "prost", + "tonic", + "tonic-build", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb" +dependencies = [ + "opentelemetry", +] + +[[package]] +name = "opentelemetry_api" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22" +dependencies = [ + "fnv", + "futures-channel", + "futures-util", + "indexmap", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113" +dependencies = [ + "async-trait", + "crossbeam-channel", + "dashmap", + "fnv", + "futures-channel", + "futures-executor", + "futures-util", + "once_cell", + "opentelemetry_api", + "percent-encoding", + "rand", + "thiserror", + "tokio", + "tokio-stream", +] + [[package]] name = "os_info" version = "3.5.1" @@ -2230,14 +2307,13 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" name = "pageserver" version = "0.1.0" dependencies = [ - "amplify_num", "anyhow", "async-stream", "async-trait", "byteorder", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "close_fds", "const_format", "consumption_metrics", @@ -2269,7 +2345,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", - "rstar", + "rpds", "scopeguard", "serde", "serde_json", @@ -2581,9 +2657,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.49" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5" +checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" dependencies = [ "unicode-ident", ] @@ -2683,7 +2759,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "consumption_metrics", "futures", "git-version", @@ -2742,14 
+2818,13 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", "rand_chacha", "rand_core", - "rand_hc", ] [[package]] @@ -2771,15 +2846,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core", -] - [[package]] name = "rayon" version = "1.6.1" @@ -2930,7 +2996,7 @@ dependencies = [ "cc", "libc", "once_cell", - "spin 0.5.2", + "spin", "untrusted", "web-sys", "winapi", @@ -2950,14 +3016,12 @@ dependencies = [ ] [[package]] -name = "rstar" -version = "0.9.3" +name = "rpds" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" +checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000" dependencies = [ - "heapless", - "num-traits", - "smallvec", + "archery", ] [[package]] @@ -3018,9 +3082,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.6" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549" +checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" dependencies = [ "bitflags", "errno", @@ -3093,7 +3157,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap 4.0.32", + "clap 4.1.1", "const_format", "crc32c", "fs2", @@ -3479,21 +3543,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" -[[package]] -name = "spin" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" -dependencies = [ - "lock_api", -] - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - [[package]] name = "static_assertions" version = "1.1.0" @@ -3507,7 +3556,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.0.32", + "clap 4.1.1", "const_format", "futures", "futures-core", @@ -3639,9 +3688,9 @@ dependencies = [ [[package]] name = "termcolor" -version = "1.1.3" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" dependencies = [ "winapi-util", ] @@ -3749,9 +3798,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.24.1" +version = "1.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae" +checksum = "597a12a59981d9e3c38d216785b0c37399f6e415e8d0712047620f189371b0bb" dependencies = [ "autocfg", "bytes", @@ -4071,6 +4120,20 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de" +dependencies = [ + "once_cell", + "opentelemetry", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", +] + [[package]] name = "tracing-serde" version = "0.1.3" @@ -4102,6 +4165,22 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tracing-utils" +version = "0.1.0" +dependencies = [ + "hyper", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry-semantic-conventions", + "reqwest", + "tokio", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", + "workspace_hack", +] + [[package]] name = "try-lock" version = "0.2.4" @@ -4183,9 +4262,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "ureq" -version = "2.6.1" +version = "2.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "733b5ad78377302af52c0dbcb2623d78fe50e4b3bf215948ff29e9ee031d8566" +checksum = "338b31dd1314f68f3aabf3ed57ab922df95ffcd902476ca7ba3c4ce7b908c46d" dependencies = [ "base64 0.13.1", "log", @@ -4226,6 +4305,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "atty", "bincode", "byteorder", "bytes", @@ -4287,7 +4367,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.32", + "clap 4.1.1", "env_logger", "log", "once_cell", @@ -4534,11 +4614,13 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "crossbeam-utils", "either", "fail", + "futures", "futures-channel", + "futures-executor", "futures-task", "futures-util", "indexmap", @@ -4554,6 +4636,9 @@ dependencies = [ "rand", "regex", "regex-syntax", + "reqwest", + "ring", + "rustls", "scopeguard", "serde", "serde_json", @@ -4561,6 +4646,7 @@ dependencies = [ "syn", "tokio", "tokio-util", + "tonic", "tower", "tracing", "tracing-core", diff --git a/Cargo.toml b/Cargo.toml index 74cc16d690..e6695c4246 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,10 @@ nix = "0.26" notify = "5.0.0" num-traits = "0.2.15" once_cell = "1.13" +opentelemetry = "0.18.0" +opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.10.0" +tracing-opentelemetry = "0.18.0" parking_lot = "0.12" pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency @@ -69,7 +73,7 @@ rand = "0.8" regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } routerify = "3" -rstar = "0.9.3" +rpds = "0.12.0" rustls = "0.20" rustls-pemfile = "1" rustls-split = "0.3" @@ -107,9 +111,6 @@ x509-parser = "0.14" env_logger = "0.10" log = "0.4" -## TODO switch when the new release is made -amplify_num = { git = "https://github.com/rust-amplify/rust-amplify.git", tag = "v4.0.0-beta.1" } - ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } @@ -128,6 +129,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be 
heavy. tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } +tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } ## Common library dependency diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node similarity index 86% rename from Dockerfile.compute-node-v14 rename to Dockerfile.compute-node index 2deb95a93f..936f368833 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node @@ -1,8 +1,5 @@ -# -# This file is identical to the Dockerfile.compute-node-v15 file -# except for the version of Postgres that is built. -# - +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust ARG TAG=pinned ######################################################################################### @@ -22,7 +19,8 @@ RUN apt update && \ # ######################################################################################### FROM build-deps AS pg-build -COPY vendor/postgres-v14 postgres +ARG PG_VERSION +COPY vendor/postgres-${PG_VERSION} postgres RUN cd postgres && \ ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ @@ -135,6 +133,27 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### +# +# Layer "unit-pg-build" +# compile unit extension +# +######################################################################################### +FROM build-deps AS unit-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \ + tar xvzf 7.7.tar.gz && \ + cd postgresql-unit-7.7 && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + # unit extension's "create extension" script relies on absolute install path to fill some reference tables. + # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. + # This one-liner removes pgsql/ part of the path. + # NOTE: Other extensions that rely on MODULEDIR variable after building phase will need the same fix. 
+ find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -146,6 +165,7 @@ COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / +COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -158,7 +178,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ # Compile and run the Neon-specific `compute_ctl` binary # ######################################################################################### -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 deleted file mode 100644 index 8647ce2bf4..0000000000 --- a/Dockerfile.compute-node-v15 +++ /dev/null @@ -1,220 +0,0 @@ -# -# This file is identical to the Dockerfile.compute-node-v14 file -# except for the version of Postgres that is built. -# - -ARG TAG=pinned - -######################################################################################### -# -# Layer "build-deps" -# -######################################################################################### -FROM debian:bullseye-slim AS build-deps -RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ - zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev - -######################################################################################### -# -# Layer "pg-build" -# Build Postgres from the neon postgres repository. -# -######################################################################################### -FROM build-deps AS pg-build -COPY vendor/postgres-v15 postgres -RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ - # Enable some of contrib extensions - echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control - -######################################################################################### -# -# Layer "postgis-build" -# Build PostGIS from the upstream PostGIS mirror. 
-# -######################################################################################### -FROM build-deps AS postgis-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc - -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ - tar xvzf postgis-3.3.1.tar.gz && \ - cd postgis-3.3.1 && \ - ./autogen.sh && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - ./configure && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - cd extensions/postgis && \ - make clean && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control - -######################################################################################### -# -# Layer "plv8-build" -# Build plv8 -# -######################################################################################### -FROM build-deps AS plv8-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils - -# https://github.com/plv8/plv8/issues/475: -# v8 uses gold for linking and sets `--thread-count=4` which breaks -# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) -# Install newer gold version manually as debian-testing binutils version updates -# libc version, which in turn breaks other extension built against non-testing libc. 
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ - tar xvzf binutils-2.38.tar.gz && \ - cd binutils-2.38 && \ - cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ - cd ../bfd && ./configure && make bfdver.h && \ - cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ - cp /usr/local/bin/ld.gold /usr/bin/gold - -# Sed is used to patch for https://github.com/plv8/plv8/issues/503 -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ - tar xvzf v3.1.4.tar.gz && \ - cd plv8-3.1.4 && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ - rm -rf /plv8-* && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control - -######################################################################################### -# -# Layer "h3-pg-build" -# Build h3_pg -# -######################################################################################### -FROM build-deps AS h3-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -# packaged cmake is too old -RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ - -q -O /tmp/cmake-install.sh \ - && chmod u+x /tmp/cmake-install.sh \ - && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ - && rm /tmp/cmake-install.sh - -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ - tar xvzf h3.tgz && \ - cd h3-4.0.1 && \ - mkdir build && \ - cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/h3 make install && \ - cp -R /h3/usr / && \ - rm -rf build - -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ - tar xvzf h3-pg.tgz && \ - cd h3-pg-4.0.1 && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control - -######################################################################################### -# -# Layer "neon-pg-ext-build" -# compile neon extensions -# -######################################################################################### -FROM build-deps AS neon-pg-ext-build -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /h3/usr / -COPY pgxn/ pgxn/ - -RUN make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon \ - -s install - -######################################################################################### -# -# Compile and run the Neon-specific `compute_ctl` binary -# -######################################################################################### -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools -USER nonroot -# Copy entire project to get Cargo.* files with proper dependencies for the whole project -COPY --chown=nonroot . . 
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto - -######################################################################################### -# -# Clean up postgres folder before inclusion -# -######################################################################################### -FROM neon-pg-ext-build AS postgres-cleanup-layer -COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql - -# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp - -# Remove headers that we won't need anymore - we've completed installation of all extensions -RUN rm -r /usr/local/pgsql/include - -# Remove static postgresql libraries - all compilation is finished, so we -# can now remove these files - they must be included in other binaries by now -# if they were to be used by other libraries. -RUN rm /usr/local/pgsql/lib/lib*.a - -######################################################################################### -# -# Final layer -# Put it all together into the final image -# -######################################################################################### -FROM debian:bullseye-slim -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute && \ - echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig - -COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl - -# Install: -# libreadline8 for psql -# libossp-uuid16 for extension ossp-uuid -# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -RUN apt update && \ - apt install --no-install-recommends -y \ - libreadline8 \ - libossp-uuid16 \ - libgeos-c1v5 \ - libgdal28 \ - libproj19 \ - libprotobuf-c1 \ - gdb && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -USER postgres -ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 4536604bdf..f8c3481f57 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,6 +11,7 @@ clap.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } notify.workspace = true +opentelemetry.workspace = true postgres.workspace = true regex.workspace = true serde.workspace = true @@ -19,7 +20,9 @@ tar.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +tracing-utils.workspace = true url.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e5ab8eb153..2c42662020 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -53,7 +53,7 @@ use compute_tools::spec::*; use url::Url; fn main() -> Result<()> { - init_logger(DEFAULT_LOG_LEVEL)?; + init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let matches = cli().get_matches(); @@ -84,6 +84,29 @@ fn main() -> Result<()> { } }; + // Extract OpenTelemetry context for the startup actions from the spec, and 
+ // attach it to the current tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry::sdk::propagation::TraceContextPropagator; + Some(TraceContextPropagator::new().extract(carrier).attach()) + } else { + None + }; + let pageserver_connstr = spec .cluster .settings @@ -140,6 +163,9 @@ fn main() -> Result<()> { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. if let Some(mut pg) = pg { + // Startup is finished, exit the startup tracing span + drop(startup_context_guard); + let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); @@ -159,6 +185,10 @@ fn main() -> Result<()> { info!("shutting down"); } + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. + tracing_utils::shutdown_tracing(); + exit(exit_code.unwrap_or(1)) } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index f2a49f332c..589a8e1434 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,16 +3,21 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::ComputeNode; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use serde_json; use tracing::{error, info}; - -use crate::compute::ComputeNode; +use tracing_utils::http::OtelName; // Service function to handle all available routes. -async fn routes(req: Request, compute: Arc) -> Response { +async fn routes(req: Request, compute: &Arc) -> Response { + // + // NOTE: The URI path is currently included in traces. That's OK because + // it doesn't contain any variable parts or sensitive information. But + // please keep that in mind if you change the routing here. + // match (req.method(), req.uri().path()) { // Serialized compute state. (&Method::GET, "/status") => { @@ -30,7 +35,7 @@ async fn routes(req: Request, compute: Arc) -> Response (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); - let res = crate::checker::check_writability(&compute).await; + let res = crate::checker::check_writability(compute).await; match res { Ok(_) => Response::new(Body::from("true")), Err(e) => Response::new(Body::from(e.to_string())), @@ -56,7 +61,19 @@ async fn serve(state: Arc) { async move { Ok::<_, Infallible>(service_fn(move |req: Request| { let state = state.clone(); - async move { Ok::<_, Infallible>(routes(req, state).await) } + async move { + Ok::<_, Infallible>( + // NOTE: We include the URI path in the string. It + // doesn't contain any variable parts or sensitive + // information in this API. 
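
For contrast with the `OtelName::UriPath` choice in the hunk above: a service whose URI paths embed per-request or sensitive values would pass a constant span name instead. This is a minimal sketch, not code from the patch; it assumes the `routes` handler and `ComputeNode` type defined in this same file, and the constant name is invented for illustration.

use std::sync::Arc;

use hyper::{Body, Request, Response};
use tracing_utils::http::OtelName;

use crate::compute::ComputeNode;

// Same wrapper call shape as in `serve()` above, but with a fixed
// "otel.name" so that no part of the request path ends up in the trace.
async fn routes_traced_with_constant_name(
    req: Request<Body>,
    state: Arc<ComputeNode>,
) -> Response<Body> {
    tracing_utils::http::tracing_handler(
        req,
        |req| routes(req, &state),
        // Hypothetical constant span name, instead of OtelName::UriPath.
        OtelName::Constant("compute_ctl:http"),
    )
    .await
}
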
+ tracing_utils::http::tracing_handler( + req, + |req| routes(req, &state), + OtelName::UriPath, + ) + .await, + ) + } })) } }); diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 57e5496e86..1b5cf647b0 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,21 +1,37 @@ -use anyhow::Result; +use tracing_opentelemetry::OpenTelemetryLayer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; -/// Initialize `env_logger` using either `default_level` or +/// Initialize logging to stderr, and OpenTelemetry tracing and exporter. +/// +/// Logging is configured using either `default_log_level` or /// `RUST_LOG` environment variable as default log level. -pub fn init_logger(default_level: &str) -> Result<()> { +/// +/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up +/// configuration from environment variables. For example, to change the destination, +/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See +/// `tracing-utils` package description. +/// +pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { + // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level)); + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); let fmt_layer = tracing_subscriber::fmt::layer() .with_target(false) .with_writer(std::io::stderr); + // Initialize OpenTelemetry + let otlp_layer = + tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new); + + // Put it all together tracing_subscriber::registry() .with(env_filter) + .with(otlp_layer) .with(fmt_layer) .init(); + tracing::info!("logging and tracing started"); Ok(()) } diff --git a/compute_tools/src/params.rs b/compute_tools/src/params.rs index 925a2f8ef3..0ce01ff478 100644 --- a/compute_tools/src/params.rs +++ b/compute_tools/src/params.rs @@ -1,3 +1,9 @@ pub const DEFAULT_LOG_LEVEL: &str = "info"; -pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres"; +// From Postgres docs: +// To ease transition from the md5 method to the newer SCRAM method, if md5 is specified +// as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM +// (see below), then SCRAM-based authentication will automatically be chosen instead. +// https://www.postgresql.org/docs/15/auth-password.html +// +// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles. pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5"; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 97cd623052..bbd0ec21ed 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::path::Path; use std::str::FromStr; @@ -22,6 +23,8 @@ pub struct ComputeSpec { /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, + + pub startup_tracing_context: Option>, } /// Cluster state seen from the perspective of the external tools @@ -152,8 +155,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { { RoleAction::Update } else if let Some(pg_pwd) = &r.encrypted_password { - // Check whether password changed or not (trim 'md5:' prefix first) - if pg_pwd[3..] 
!= *role.encrypted_password.as_ref().unwrap() { + // Check whether password changed or not (trim 'md5' prefix first if any) + // + // This is a backward compatibility hack, which comes from the times when we were using + // md5 for everyone and hashes were stored in the console db without md5 prefix. So when + // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix, + // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix. + // Here is the only place so far where we compare hashes, so it seems to be the best candidate + // to place this compatibility layer. + let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") { + stripped + } else { + pg_pwd + }; + if pg_pwd != *role.encrypted_password.as_ref().unwrap() { RoleAction::Update } else { RoleAction::None @@ -372,13 +387,13 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { name.pg_quote(), db.owner.pg_quote() ); - let _ = info_span!("executing", query).entered(); + let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; } DatabaseAction::Create => { let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); query.push_str(&db.to_pg_options()); - let _ = info_span!("executing", query).entered(); + let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; } }; diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 880ab0e83c..07d220195b 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -8,6 +8,7 @@ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; +pub use prometheus::{register_counter_vec, Counter, CounterVec}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; pub use prometheus::{register_histogram, Histogram}; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b5027cb331..0d7aa2db55 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -29,6 +29,14 @@ pub enum TenantState { Broken, } +pub mod state { + pub const LOADING: &str = "loading"; + pub const ATTACHING: &str = "attaching"; + pub const ACTIVE: &str = "active"; + pub const STOPPING: &str = "stopping"; + pub const BROKEN: &str = "broken"; +} + impl TenantState { pub fn has_in_progress_downloads(&self) -> bool { match self { @@ -39,23 +47,32 @@ impl TenantState { Self::Broken => false, } } + + pub fn as_str(&self) -> &'static str { + match self { + TenantState::Loading => state::LOADING, + TenantState::Attaching => state::ATTACHING, + TenantState::Active => state::ACTIVE, + TenantState::Stopping => state::STOPPING, + TenantState::Broken => state::BROKEN, + } + } } /// A state of a timeline in pageserver's memory. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TimelineState { - /// Timeline is fully operational. If the containing Tenant is Active, the timeline's - /// background jobs are running otherwise they will be launched when the tenant is activated. + /// The timeline is recognized by the pageserver but is not yet operational. + /// In particular, the walreceiver connection loop is not running for this timeline. + /// It will eventually transition to state Active or Broken. + Loading, + /// The timeline is fully operational. 
+    /// It can be queried, and the walreceiver connection loop is running.
     Active,
-    /// A timeline is recognized by pageserver, but not yet ready to operate.
-    /// The status indicates, that the timeline could eventually go back to Active automatically:
-    /// for example, if the owning tenant goes back to Active again.
-    Suspended,
-    /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to
-    /// automatically become Active after certain events: only a management call can change this status.
+    /// The timeline was previously Loading or Active but is shutting down.
+    /// It cannot transition back into any other state.
     Stopping,
-    /// A timeline is recognized by the pageserver, but can no longer be used for
-    /// any operations, because it failed to be activated.
+    /// The timeline is broken and not operational (previous states: Loading or Active).
     Broken,
 }
 
diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml
new file mode 100644
index 0000000000..8c3d3f9063
--- /dev/null
+++ b/libs/tracing-utils/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "tracing-utils"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+hyper.workspace = true
+opentelemetry = { workspace = true, features=["rt-tokio"] }
+opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
+opentelemetry-semantic-conventions.workspace = true
+reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] }
+tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
+tracing.workspace = true
+tracing-opentelemetry.workspace = true
+tracing-subscriber.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs
new file mode 100644
index 0000000000..3f80f49de1
--- /dev/null
+++ b/libs/tracing-utils/src/http.rs
@@ -0,0 +1,96 @@
+//! Tracing wrapper for Hyper HTTP server
+
+use hyper::HeaderMap;
+use hyper::{Body, Request, Response};
+use std::future::Future;
+use tracing::Instrument;
+use tracing_opentelemetry::OpenTelemetrySpanExt;
+
+/// Configuration option for what to use as the "otel.name" field in the traces.
+pub enum OtelName<'a> {
+    /// Use a constant string
+    Constant(&'a str),
+
+    /// Use the path from the request.
+    ///
+    /// That's very useful information, but is not appropriate if the
+    /// path contains parameters that differ on every request, or worse,
+    /// sensitive information like usernames or email addresses.
+    ///
+    /// See 
+    UriPath,
+}
+
+/// Handle an incoming HTTP request using the given handler function,
+/// with OpenTelemetry tracing.
+///
+/// This runs 'handler' on the request in a new span, with fields filled in
+/// from the request. Notably, if the request contains tracing information,
+/// it is propagated to the span, so that this request is traced as part of
+/// the same trace.
+///
+/// XXX: Usually, this is handled by existing libraries, or built
+/// directly into HTTP servers. However, I couldn't find one for Hyper,
+/// so I had to write our own. The OpenTelemetry website has a registry of
+/// instrumentation libraries at:
+/// https://opentelemetry.io/registry/?language=rust&component=instrumentation
+/// If a Hyper crate appears, consider switching to that.
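+///
+/// # Example
+///
+/// A minimal usage sketch; `my_routes` is a hypothetical handler supplied by
+/// the caller, shown only to illustrate the expected signature:
+///
+/// ```ignore
+/// async fn my_routes(req: Request<Body>) -> Response<Body> {
+///     Response::new(Body::from("hello"))
+/// }
+///
+/// let response = tracing_handler(req, my_routes, OtelName::Constant("my-service")).await;
+/// ```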
+pub async fn tracing_handler( + req: Request, + handler: F, + otel_name: OtelName<'_>, +) -> Response +where + F: Fn(Request) -> R, + R: Future>, +{ + // Create a tracing span, with context propagated from the incoming + // request if any. + // + // See list of standard fields defined for HTTP requests at + // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/http.md + // We only fill in a few of the most useful ones here. + let otel_name = match otel_name { + OtelName::Constant(s) => s, + OtelName::UriPath => req.uri().path(), + }; + + let span = tracing::info_span!( + "http request", + otel.name= %otel_name, + http.method = %req.method(), + http.status_code = tracing::field::Empty, + ); + let parent_ctx = extract_remote_context(req.headers()); + span.set_parent(parent_ctx); + + // Handle the request within the span + let response = handler(req).instrument(span.clone()).await; + + // Fill in the fields from the response code + let status = response.status(); + span.record("http.status_code", status.as_str()); + span.record( + "otel.status_code", + if status.is_success() { "OK" } else { "ERROR" }, + ); + + response +} + +// Extract remote tracing context from the HTTP headers +fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context { + struct HeaderExtractor<'a>(&'a HeaderMap); + + impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> { + fn get(&self, key: &str) -> Option<&str> { + self.0.get(key).and_then(|value| value.to_str().ok()) + } + + fn keys(&self) -> Vec<&str> { + self.0.keys().map(|value| value.as_str()).collect() + } + } + let extractor = HeaderExtractor(headers); + opentelemetry::global::get_text_map_propagator(|propagator| propagator.extract(&extractor)) +} diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs new file mode 100644 index 0000000000..de0e2ad799 --- /dev/null +++ b/libs/tracing-utils/src/lib.rs @@ -0,0 +1,168 @@ +//! Helper functions to set up OpenTelemetry tracing. +//! +//! This comes in two variants, depending on whether you have a Tokio runtime available. +//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use +//! the current tokio runtime. If you don't have a runtime available, or you don't want +//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()` +//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks. +//! +//! Example: +//! +//! ```rust,no_run +//! use tracing_subscriber::prelude::*; +//! use tracing_opentelemetry::OpenTelemetryLayer; +//! +//! #[tokio::main] +//! async fn main() { +//! // Set up logging to stderr +//! let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() +//! .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")); +//! let fmt_layer = tracing_subscriber::fmt::layer() +//! .with_target(false) +//! .with_writer(std::io::stderr); +//! +//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces +//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new); +//! +//! // Put it all together +//! tracing_subscriber::registry() +//! .with(env_filter) +//! .with(otlp_layer) +//! .with(fmt_layer) +//! .init(); +//! } +//! 
``` + +use opentelemetry::sdk::Resource; +use opentelemetry::KeyValue; +use opentelemetry_otlp::WithExportConfig; +use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}; + +pub use tracing_opentelemetry::OpenTelemetryLayer; + +pub mod http; + +/// Set up OpenTelemetry exporter, using configuration from environment variables. +/// +/// `service_name` is set as the OpenTelemetry 'service.name' resource (see +/// ) +/// +/// We try to follow the conventions for the environment variables specified in +/// +/// +/// However, we only support a subset of those options: +/// +/// - OTEL_SDK_DISABLED is supported. The default is "false", meaning tracing +/// is enabled by default. Set it to "true" to disable. +/// +/// - We use the OTLP exporter, with HTTP protocol. Most of the OTEL_EXPORTER_OTLP_* +/// settings specified in +/// +/// are supported, as they are handled by the `opentelemetry-otlp` crate. +/// Settings related to other exporters have no effect. +/// +/// - Some other settings are supported by the `opentelemetry` crate. +/// +/// If you need some other setting, please test if it works first. And perhaps +/// add a comment in the list above to save the effort of testing for the next +/// person. +/// +/// This doesn't block, but is marked as 'async' to hint that this must be called in +/// asynchronous execution context. +pub async fn init_tracing(service_name: &str) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + Some(init_tracing_internal(service_name.to_string())) +} + +/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing +/// tasks. +pub fn init_tracing_without_runtime( + service_name: &str, +) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + + // The opentelemetry batch processor and the OTLP exporter needs a Tokio + // runtime. Create a dedicated runtime for them. One thread should be + // enough. + // + // (Alternatively, instead of batching, we could use the "simple + // processor", which doesn't need Tokio, and use "reqwest-blocking" + // feature for the OTLP exporter, which also doesn't need Tokio. However, + // batching is considered best practice, and also I have the feeling that + // the non-Tokio codepaths in the opentelemetry crate are less used and + // might be more buggy, so better to stay on the well-beaten path.) + // + // We leak the runtime so that it keeps running after we exit the + // function. + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .thread_name("otlp runtime thread") + .worker_threads(1) + .build() + .unwrap(), + )); + let _guard = runtime.enter(); + + Some(init_tracing_internal(service_name.to_string())) +} + +fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer { + // Set up exporter from the OTEL_EXPORTER_* environment variables + let mut exporter = opentelemetry_otlp::new_exporter().http().with_env(); + + // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the + // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the + // OpenTelemetry spec at + // , + // the full exporter URL is formed by appending "/v1/traces" to the value + // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does + // that with the grpc-tonic exporter. Other exporters, like the HTTP + // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without + // appending "/v1/traces". 
+ // + // See https://github.com/open-telemetry/opentelemetry-rust/pull/950 + // + // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting + // the endpoint url with the "/v1/traces" path ourselves. If the bug is + // fixed in a later version, we can remove this code. But if we don't + // remember to remove this, it won't do any harm either, as the crate will + // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint + // is set directly with `with_endpoint`. + if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() { + if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) { + if !endpoint.ends_with('/') { + endpoint.push('/'); + } + endpoint.push_str("v1/traces"); + exporter = exporter.with_endpoint(endpoint); + } + } + + // Propagate trace information in the standard W3C TraceContext format. + opentelemetry::global::set_text_map_propagator( + opentelemetry::sdk::propagation::TraceContextPropagator::new(), + ); + + opentelemetry_otlp::new_pipeline() + .tracing() + .with_exporter(exporter) + .with_trace_config( + opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])), + ) + .install_batch(opentelemetry::runtime::Tokio) + .expect("could not initialize opentelemetry exporter") +} + +// Shutdown trace pipeline gracefully, so that it has a chance to send any +// pending traces before we exit. +pub fn shutdown_tracing() { + opentelemetry::global::shutdown_tracer_provider(); +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 020e4d9dd7..1f6c96bdbe 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +atty.workspace = true sentry.workspace = true async-trait.workspace = true anyhow.workspace = true diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index b0ecb746d9..1ba0422993 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,6 +1,7 @@ use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; use thiserror::Error; +use tracing::error; #[derive(Debug, Error)] pub enum ApiError { @@ -76,8 +77,16 @@ impl HttpErrorBody { } pub async fn handler(err: routerify::RouteError) -> Response { - tracing::error!("Error processing HTTP request: {:?}", err); - err.downcast::() - .expect("handler should always return api error") - .into_response() + let api_error = err + .downcast::() + .expect("handler should always return api error"); + + // Print a stack trace for Internal Server errors + if let ApiError::InternalServerError(_) = api_error.as_ref() { + error!("Error processing HTTP request: {api_error:?}"); + } else { + error!("Error processing HTTP request: {api_error:#}"); + } + + api_error.into_response() } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 82c9267f4a..02684d3d16 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -34,7 +34,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { let base_logger = tracing_subscriber::fmt() .with_env_filter(env_filter) .with_target(false) - .with_ansi(false) + .with_ansi(atty::is(atty::Stream::Stdout)) .with_writer(std::io::stdout); match log_format { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index cb9e4478bf..66c25e8576 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -11,7 +11,6 @@ default = [] testing = 
["fail/failpoints"] [dependencies] -amplify_num.workspace = true anyhow.workspace = true async-stream.workspace = true async-trait.workspace = true @@ -41,7 +40,6 @@ postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true regex.workspace = true -rstar.workspace = true scopeguard.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } @@ -68,6 +66,7 @@ tenant_size_model.workspace = true utils.workspace = true workspace_hack.workspace = true reqwest.workspace = true +rpds.workspace = true [dev-dependencies] criterion.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 6a01fdfc6f..e18c00da96 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,13 +1,12 @@ -use anyhow::Result; +use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState}; -use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult}; +use pageserver::tenant::storage_layer::Layer; +use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, LayerDescriptor}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; -use std::ops::Range; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -17,102 +16,35 @@ use utils::lsn::Lsn; use criterion::{criterion_group, criterion_main, Criterion}; -struct DummyDelta { - key_range: Range, - lsn_range: Range, -} - -impl Layer for DummyDelta { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - ) -> Result { - panic!() - } - - fn is_incremental(&self) -> bool { - true - } - - fn dump(&self, _verbose: bool) -> Result<()> { - unimplemented!() - } - - fn short_id(&self) -> String { - unimplemented!() - } -} - -struct DummyImage { - key_range: Range, - lsn: Lsn, -} - -impl Layer for DummyImage { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - // End-bound is exclusive - self.lsn..(self.lsn + 1) - } - - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - ) -> Result { - panic!() - } - - fn is_incremental(&self) -> bool { - false - } - - fn dump(&self, _verbose: bool) -> Result<()> { - unimplemented!() - } - - fn short_id(&self) -> String { - unimplemented!() - } -} - -fn build_layer_map(filename_dump: PathBuf) -> LayerMap { - let mut layer_map = LayerMap::::default(); +fn build_layer_map(filename_dump: PathBuf) -> LayerMap { + let mut layer_map = LayerMap::::default(); let mut min_lsn = Lsn(u64::MAX); let mut max_lsn = Lsn(0); let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines(); + let mut updates = layer_map.batch_update(); for fname in filenames { let fname = &fname.unwrap(); if let Some(imgfilename) = ImageFileName::parse_str(fname) { - let layer = DummyImage { - key_range: imgfilename.key_range, - lsn: imgfilename.lsn, + let layer = LayerDescriptor { + key: imgfilename.key_range, + lsn: imgfilename.lsn..(imgfilename.lsn + 1), + is_incremental: false, + short_id: 
fname.to_string(), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); min_lsn = min(min_lsn, imgfilename.lsn); max_lsn = max(max_lsn, imgfilename.lsn); } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { - let layer = DummyDelta { - key_range: deltafilename.key_range, - lsn_range: deltafilename.lsn_range.clone(), + let layer = LayerDescriptor { + key: deltafilename.key_range.clone(), + lsn: deltafilename.lsn_range.clone(), + is_incremental: true, + short_id: fname.to_string(), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); min_lsn = min(min_lsn, deltafilename.lsn_range.start); max_lsn = max(max_lsn, deltafilename.lsn_range.end); } else { @@ -122,11 +54,12 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { println!("min: {min_lsn}, max: {max_lsn}"); + updates.flush(); layer_map } /// Construct a layer map query pattern for benchmarks -fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { +fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { // For each image layer we query one of the pages contained, at LSN right // before the image layer was created. This gives us a somewhat uniform // coverage of both the lsn and key space because image layers have @@ -150,6 +83,41 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { .collect() } +// Construct a partitioning for testing get_difficulty map when we +// don't have an exact result of `collect_keyspace` to work with. +fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { + let mut parts = Vec::new(); + + // We add a partition boundary at the start of each image layer, + // no matter what lsn range it covers. This is just the easiest + // thing to do. A better thing to do would be to get a real + // partitioning from some database. Even better, remove the need + // for key partitions by deciding where to create image layers + // directly based on a coverage-based difficulty map. + let mut keys: Vec<_> = layer_map + .iter_historic_layers() + .filter_map(|l| { + if l.is_incremental() { + None + } else { + let kr = l.get_key_range(); + Some(kr.start.next()) + } + }) + .collect(); + keys.sort(); + + let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap(); + for key in keys { + parts.push(KeySpace { + ranges: vec![current_key..key], + }); + current_key = key; + } + + KeyPartitioning { parts } +} + // Benchmark using metadata extracted from our performance test environment, from // a project where we have run pgbench many timmes. The pgbench database was initialized // between each test run. @@ -183,24 +151,68 @@ fn bench_from_captest_env(c: &mut Criterion) { // Benchmark using metadata extracted from a real project that was taknig // too long processing layer map queries. 
fn bench_from_real_project(c: &mut Criterion) { - // TODO consider compressing this file + // Init layer map + let now = Instant::now(); let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + println!("Finished layer map init in {:?}", now.elapsed()); + + // Choose uniformly distributed queries let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Test with uniform query pattern - c.bench_function("real_map_uniform_queries", |b| { + // Choose inputs for get_difficulty_map + let latest_lsn = layer_map + .iter_historic_layers() + .map(|l| l.get_lsn_range().end) + .max() + .unwrap(); + let partitioning = uniform_key_partitioning(&layer_map, latest_lsn); + + // Check correctness of get_difficulty_map + // TODO put this in a dedicated test outside of this mod + { + println!("running correctness check"); + + let now = Instant::now(); + let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning); + assert!(result_bruteforce.len() == partitioning.parts.len()); + println!("Finished bruteforce in {:?}", now.elapsed()); + + let now = Instant::now(); + let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None); + assert!(result_fast.len() == partitioning.parts.len()); + println!("Finished fast in {:?}", now.elapsed()); + + // Assert results are equal. Manually iterate for easier debugging. + let zip = std::iter::zip( + &partitioning.parts, + std::iter::zip(result_bruteforce, result_fast), + ); + for (_part, (bruteforce, fast)) in zip { + assert_eq!(bruteforce, fast); + } + + println!("No issues found"); + } + + // Define and name the benchmark function + let mut group = c.benchmark_group("real_map"); + group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1); } }); }); + group.bench_function("get_difficulty_map", |b| { + b.iter(|| { + layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3)); + }); + }); + group.finish(); } // Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. fn bench_sequential(c: &mut Criterion) { - let mut layer_map: LayerMap = LayerMap::default(); - // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. // // TODO This code is pretty slow and runs even if we're only running other @@ -208,39 +220,39 @@ fn bench_sequential(c: &mut Criterion) { // Putting it inside the `bench_function` closure is not a solution // because then it runs multiple times during warmup. let now = Instant::now(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); for i in 0..100_000 { - // TODO try inserting a super-wide layer in between every 10 to reflect - // what often happens with L1 layers that include non-rel changes. - // Maybe do that as a separate test. let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); - let layer = DummyImage { - key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1), - lsn: Lsn(10 * i), + let layer = LayerDescriptor { + key: zero.add(10 * i32)..zero.add(10 * i32 + 1), + lsn: Lsn(i)..Lsn(i + 1), + is_incremental: false, + short_id: format!("Layer {}", i), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); } - - // Manually measure runtime without criterion because criterion - // has a minimum sample size of 10 and I don't want to run it 10 times. 
- println!("Finished init in {:?}", now.elapsed()); + updates.flush(); + println!("Finished layer map init in {:?}", now.elapsed()); // Choose 100 uniformly random queries let rng = &mut StdRng::seed_from_u64(1); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map) - .choose_multiple(rng, 1) + .choose_multiple(rng, 100) .copied() .collect(); // Define and name the benchmark function - c.bench_function("sequential_uniform_queries", |b| { - // Run the search queries + let mut group = c.benchmark_group("sequential"); + group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1); } }); }); + group.finish(); } criterion_group!(group_1, bench_from_captest_env); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index f1d92ac36b..06d4853274 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -27,6 +27,7 @@ use tracing::*; /// use tokio_tar::{Builder, EntryType, Header}; +use crate::context::RequestContext; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -52,6 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>( req_lsn: Option, prev_lsn: Option, full_backup: bool, + ctx: &'a RequestContext, ) -> anyhow::Result<()> where W: AsyncWrite + Send + Sync + Unpin, @@ -110,6 +112,7 @@ where lsn: backup_lsn, prev_record_lsn: prev_lsn, full_backup, + ctx, }; basebackup .send_tarball() @@ -129,6 +132,7 @@ where lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, + ctx: &'a RequestContext, } impl<'a, W> Basebackup<'a, W> @@ -171,23 +175,37 @@ where SlruKind::MultiXactOffsets, SlruKind::MultiXactMembers, ] { - for segno in self.timeline.list_slru_segments(kind, self.lsn).await? { + for segno in self + .timeline + .list_slru_segments(kind, self.lsn, self.ctx) + .await? + { self.add_slru_segment(kind, segno).await?; } } // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn).await? { + for ((spcnode, dbnode), has_relmap_file) in + self.timeline.list_dbdirs(self.lsn, self.ctx).await? + { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn).await? { + for rel in self + .timeline + .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .await? + { self.add_rel(rel).await?; } } } - for xid in self.timeline.list_twophase_files(self.lsn).await? { + for xid in self + .timeline + .list_twophase_files(self.lsn, self.ctx) + .await? 
+ { self.add_twophase_file(xid).await?; } @@ -203,7 +221,10 @@ where } async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn, false).await?; + let nblocks = self + .timeline + .get_rel_size(tag, self.lsn, false, self.ctx) + .await?; // If the relation is empty, create an empty file if nblocks == 0 { @@ -223,7 +244,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } @@ -245,14 +266,14 @@ where async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self .timeline - .get_slru_segment_size(slru, segno, self.lsn) + .get_slru_segment_size(slru, segno, self.lsn, self.ctx) .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { let img = self .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx) .await?; if slru == SlruKind::Clog { @@ -287,7 +308,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, self.lsn) + .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) .await?; ensure!(img.len() == 512); Some(img) @@ -323,7 +344,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn) + .list_rels(spcnode, dbnode, self.lsn, self.ctx) .await? .is_empty() { @@ -356,7 +377,10 @@ where // Extract twophase state files // async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self.timeline.get_twophase_file(xid, self.lsn).await?; + let img = self + .timeline + .get_twophase_file(xid, self.lsn, self.ctx) + .await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -394,12 +418,12 @@ where let checkpoint_bytes = self .timeline - .get_checkpoint(self.lsn) + .get_checkpoint(self.lsn, self.ctx) .await .context("failed to get checkpoint bytes")?; let pg_control_bytes = self .timeline - .get_control_file(self.lsn) + .get_control_file(self.lsn, self.ctx) .await .context("failed get control bytes")?; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5de6e4def5..f2cd93bd3a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -13,6 +13,7 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, + context::{DownloadBehavior, RequestContext}, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, task_mgr::{ @@ -26,7 +27,7 @@ use utils::{ logging, postgres_backend::AuthType, project_git_version, - sentry_init::{init_sentry, release_name}, + sentry_init::init_sentry, signals::{self, Signal}, tcp_listener, }; @@ -85,7 +86,10 @@ fn main() -> anyhow::Result<()> { }; // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]); + let _sentry_guard = init_sentry( + Some(GIT_VERSION.into()), + &[("node_id", &conf.id.to_string())], + ); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { @@ -246,7 +250,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { let signals = signals::install_shutdown_handlers()?; // Launch broker client - WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?; 
+ WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; // Initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -325,6 +329,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { ); if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); task_mgr::spawn( MGMT_REQUEST_RUNTIME.handle(), TaskKind::MetricsCollection, @@ -338,6 +349,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { conf.metric_collection_interval, conf.synthetic_size_calculation_interval, conf.id, + metrics_ctx, ) .instrument(info_span!("metrics_collection")) .await?; @@ -349,17 +361,34 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), - TaskKind::LibpqEndpointListener, - None, - None, - "libpq endpoint listener", - true, - async move { - page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await - }, - ); + { + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-contexts for each connection, with their + // own download behavior. This context is used only to listen and + // accept connections.) + DownloadBehavior::Error, + ); + task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), + TaskKind::LibpqEndpointListener, + None, + None, + "libpq endpoint listener", + true, + async move { + page_service::libpq_listener_main( + conf, + auth, + pageserver_listener, + conf.auth_type, + libpq_ctx, + ) + .await + }, + ); + } // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { diff --git a/pageserver/src/broker_client.rs b/pageserver/src/broker_client.rs new file mode 100644 index 0000000000..6c92967ca3 --- /dev/null +++ b/pageserver/src/broker_client.rs @@ -0,0 +1,48 @@ +//! The broker client instance of the pageserver, created during pageserver startup. +//! Used by each timelines' [`walreceiver`]. + +use crate::config::PageServerConf; + +use anyhow::Context; +use once_cell::sync::OnceCell; +use storage_broker::BrokerClientChannel; +use tracing::*; + +static BROKER_CLIENT: OnceCell = OnceCell::new(); + +/// +/// Initialize the broker client. This must be called once at page server startup. +/// +pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { + let broker_endpoint = conf.broker_endpoint.clone(); + + // Note: we do not attempt connecting here (but validate endpoints sanity). 
+ let broker_client = + storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( + format!( + "Failed to create broker client to {}", + &conf.broker_endpoint + ), + )?; + + if BROKER_CLIENT.set(broker_client).is_err() { + panic!("broker already initialized"); + } + + info!( + "Initialized broker client with endpoints: {}", + broker_endpoint + ); + Ok(()) +} + +/// +/// Get a handle to the broker client +/// +pub fn get_broker_client() -> &'static BrokerClientChannel { + BROKER_CLIENT.get().expect("broker client not initialized") +} + +pub fn is_broker_client_initialized() -> bool { + BROKER_CLIENT.get().is_some() +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 51d1664e52..a3b051279d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -158,6 +158,8 @@ pub struct PageServerConf { pub synthetic_size_calculation_interval: Duration, pub test_remote_failures: u64, + + pub ondemand_download_behavior_treat_error_as_warn: bool, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -222,6 +224,8 @@ struct PageServerConfigBuilder { synthetic_size_calculation_interval: BuilderValue, test_remote_failures: BuilderValue, + + ondemand_download_behavior_treat_error_as_warn: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -267,6 +271,8 @@ impl Default for PageServerConfigBuilder { metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), test_remote_failures: Set(0), + + ondemand_download_behavior_treat_error_as_warn: Set(false), } } } @@ -363,6 +369,14 @@ impl PageServerConfigBuilder { self.test_remote_failures = BuilderValue::Set(fail_first); } + pub fn ondemand_download_behavior_treat_error_as_warn( + &mut self, + ondemand_download_behavior_treat_error_as_warn: bool, + ) { + self.ondemand_download_behavior_treat_error_as_warn = + BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); + } + pub fn build(self) -> anyhow::Result { Ok(PageServerConf { listen_pg_addr: self @@ -422,6 +436,11 @@ impl PageServerConfigBuilder { test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, + ondemand_download_behavior_treat_error_as_warn: self + .ondemand_download_behavior_treat_error_as_warn + .ok_or(anyhow!( + "missing ondemand_download_behavior_treat_error_as_warn" + ))?, }) } } @@ -600,6 +619,7 @@ impl PageServerConf { "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), + "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -724,6 +744,7 @@ impl PageServerConf { metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), test_remote_failures: 0, + ondemand_download_behavior_treat_error_as_warn: false, } } } @@ -749,6 +770,11 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { Ok(i as u64) } +fn parse_toml_bool(name: &str, item: &Item) -> Result { + item.as_bool() + .with_context(|| format!("configure option {name} is not a bool")) +} + fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() @@ -907,6 +933,7 @@ log_format = 'json' defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, test_remote_failures: 0, + 
ondemand_download_behavior_treat_error_as_warn: false, }, "Correct defaults should be used when no config values are provided" ); @@ -954,6 +981,7 @@ log_format = 'json' metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), test_remote_failures: 0, + ondemand_download_behavior_treat_error_as_warn: false, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index c07026261d..d848ec5ee5 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -3,6 +3,7 @@ //! and push them to a HTTP endpoint. //! Cache metrics to send only the updated ones. //! +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::mgr; use anyhow; @@ -47,12 +48,15 @@ pub async fn collect_metrics( metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, node_id: NodeId, + ctx: RequestContext, ) -> anyhow::Result<()> { let mut ticker = tokio::time::interval(metric_collection_interval); info!("starting collect_metrics"); // spin up background worker that caclulates tenant sizes + let worker_ctx = + ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::CalculateSyntheticSize, @@ -61,7 +65,7 @@ pub async fn collect_metrics( "synthetic size calculation", false, async move { - calculate_synthetic_size_worker(synthetic_size_calculation_interval) + calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx) .instrument(info_span!("synthetic_size_worker")) .await?; Ok(()) @@ -79,7 +83,7 @@ pub async fn collect_metrics( return Ok(()); }, _ = ticker.tick() => { - if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await + if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await { error!("metrics collection failed: {err:?}"); } @@ -102,6 +106,7 @@ pub async fn collect_metrics_iteration( cached_metrics: &mut HashMap, metric_collection_endpoint: &reqwest::Url, node_id: NodeId, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new(); trace!( @@ -110,7 +115,7 @@ pub async fn collect_metrics_iteration( ); // get list of tenants - let tenants = mgr::list_tenants().await; + let tenants = mgr::list_tenants().await?; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -137,7 +142,7 @@ pub async fn collect_metrics_iteration( timeline_written_size, )); - let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?; + let (timeline_logical_size, is_exact) = timeline.get_current_logical_size(ctx)?; // Only send timeline logical size when it is fully calculated. 
if is_exact { current_metrics.push(( @@ -258,6 +263,7 @@ pub async fn collect_metrics_iteration( /// Caclculate synthetic size for each active tenant pub async fn calculate_synthetic_size_worker( synthetic_size_calculation_interval: Duration, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); @@ -270,7 +276,13 @@ pub async fn calculate_synthetic_size_worker( }, _ = ticker.tick() => { - let tenants = mgr::list_tenants().await; + let tenants = match mgr::list_tenants().await { + Ok(tenants) => tenants, + Err(e) => { + warn!("cannot get tenant list: {e:#}"); + continue; + } + }; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -280,7 +292,7 @@ pub async fn calculate_synthetic_size_worker( if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { - if let Err(e) = tenant.calculate_synthetic_size().await { + if let Err(e) = tenant.calculate_synthetic_size(ctx).await { error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e); } } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs new file mode 100644 index 0000000000..e826d28e6d --- /dev/null +++ b/pageserver/src/context.rs @@ -0,0 +1,199 @@ +//! This module defines `RequestContext`, a structure that we use throughout +//! the pageserver to propagate high-level context from places +//! that _originate_ activity down to the shared code paths at the +//! heart of the pageserver. It's inspired by Golang's `context.Context`. +//! +//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions: +//! 1. What high-level activity ([`TaskKind`]) needs this page? +//! We need that information as a categorical dimension for page access +//! statistics, which we, in turn, need to guide layer eviction policy design. +//! 2. How should we behave if, to produce the page image, we need to +//! on-demand download a layer file ([`DownloadBehavior`]). +//! +//! [`RequestContext`] satisfies those needs. +//! The current implementation is a small `struct` that is passed through +//! the call chain by reference. +//! +//! ### Future Work +//! +//! However, we do not intend to stop here, since there are other needs that +//! require carrying information from high to low levels of the app. +//! +//! Most importantly, **cancellation signaling** in response to +//! 1. timeouts (page_service max response time) and +//! 2. lifecycle requests (detach tenant, delete timeline). +//! +//! Related to that, there is sometimes a need to ensure that all tokio tasks spawned +//! by the transitive callees of a request have finished. The keyword here +//! is **Structured Concurrency**, and right now, we use `task_mgr` in most places, +//! `TaskHandle` in some places, and careful code review around `FuturesUnordered` +//! or `JoinSet` in other places. +//! +//! We do not yet have a systematic cancellation story in pageserver, and it is +//! pretty clear that [`RequestContext`] will be responsible for that. +//! So, the API already prepares for this role through the +//! [`RequestContext::detached_child`] and [`RequestContext::attached_child`] methods. +//! See their doc comments for details on how we will use them in the future. +//! +//! It is not clear whether or how we will enforce Structured Concurrency, and +//! what role [`RequestContext`] will play there. +//! So, the API doesn't prepare us for this topic. +//! +//! Other future uses of `RequestContext`: +//! 
- Communicate compute & IO priorities (user-initiated request vs. background-loop)
+//! - Request IDs for distributed tracing
+//! - Request/Timeline/Tenant-scoped log levels
+//!
+//! RequestContext might look quite different once it supports those features.
+//! Likely, it will have a shape similar to Golang's `context.Context`.
+//!
+//! ### Why A Struct Instead Of Method Parameters
+//!
+//! What's typical about such information is that it needs to be passed down
+//! along the call chain from high level to low level, but few of the functions
+//! in the middle need to understand it.
+//! Further, it is to be expected that we will need to propagate more data
+//! in the future (see the earlier section on future work).
+//! Hence, for functions in the middle of the call chain, we have the following
+//! requirements:
+//! 1. It should be easy to forward the context to callees.
+//! 2. To propagate more data from high-level to low-level code, the functions in
+//!    the middle should not need to be modified.
+//! The solution is to have a container structure ([`RequestContext`]) that
+//! carries the information. Functions that don't care about what's in it
+//! pass it along to callees.
+//!
+//! ### Why Not Task-Local Variables
+//!
+//! One could use task-local variables (the equivalent of thread-local variables)
+//! to address the immediate needs outlined above.
+//! However, we reject task-local variables because:
+//! 1. they are implicit, thereby making it harder to trace the data flow in code
+//!    reviews and during debugging,
+//! 2. they can be mutable, which enables implicit return data flow,
+//! 3. they are restrictive in that code which fans out into multiple tasks,
+//!    or even threads, needs to carefully propagate the state.
+//!
+//! In contrast, information flow with [`RequestContext`] is
+//! 1. always explicit,
+//! 2. strictly uni-directional because RequestContext is immutable,
+//! 3. tangible because a [`RequestContext`] is just a value.
+//!    When creating child activities, regardless of whether it's a task,
+//!    thread, or even an RPC to another service, the value can
+//!    be used like any other argument.
+//!
+//! The solution is that all code paths are infected with precisely one
+//! [`RequestContext`] argument. Functions in the middle of the call chain
+//! only need to pass it on.
+use crate::task_mgr::TaskKind;
+
+// The main structure of this module, see module-level comment.
+pub struct RequestContext {
+    task_kind: TaskKind,
+    download_behavior: DownloadBehavior,
+}
+
+/// Desired behavior if the operation requires an on-demand download
+/// to proceed.
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub enum DownloadBehavior {
+    /// Download the layer file. It can take a while.
+    Download,
+
+    /// Download the layer file, but print a warning to the log. This should be used
+    /// in code where the layer file is expected to already exist locally.
+    Warn,
+
+    /// Return a PageReconstructError::NeedsDownload error
+    Error,
+}
+
+impl RequestContext {
+    /// Create a new RequestContext that has no parent.
+    ///
+    /// The function is called `new` because, once we add children
+    /// to it using `detached_child` or `attached_child`, the contexts
+    /// form a tree (not implemented yet since cancellation will be
+    /// the first feature that requires a tree).
+    ///
+    /// # Future: Cancellation
+    ///
+    /// The only reason why a context like this one can be canceled is
+    /// because someone explicitly canceled it.
+ /// It has no parent, so it cannot inherit cancellation from there. + pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + RequestContext { + task_kind, + download_behavior, + } + } + + /// Create a detached child context for a task that may outlive `self`. + /// + /// Use this when spawning new background activity that should complete + /// even if the current request is canceled. + /// + /// # Future: Cancellation + /// + /// Cancellation of `self` will not propagate to the child context returned + /// by this method. + /// + /// # Future: Structured Concurrency + /// + /// We could add the Future as a parameter to this function, spawn it as a task, + /// and pass to the new task the child context as an argument. + /// That would be an ergonomic improvement. + /// + /// We could make new calls to this function fail if `self` is already canceled. + pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + self.child_impl(task_kind, download_behavior) + } + + /// Create a child of context `self` for a task that shall not outlive `self`. + /// + /// Use this when fanning-out work to other async tasks. + /// + /// # Future: Cancellation + /// + /// Cancelling a context will propagate to its attached children. + /// + /// # Future: Structured Concurrency + /// + /// We could add the Future as a parameter to this function, spawn it as a task, + /// and track its `JoinHandle` inside the `RequestContext`. + /// + /// We could then provide another method to allow waiting for all child tasks + /// to finish. + /// + /// We could make new calls to this function fail if `self` is already canceled. + /// Alternatively, we could allow the creation but not spawn the task. + /// The method to wait for child tasks would return an error, indicating + /// that the child task was not started because the context was canceled. + pub fn attached_child(&self) -> Self { + self.child_impl(self.task_kind(), self.download_behavior()) + } + + /// Use this function when you should be creating a child context using + /// [`attached_child`] or [`detached_child`], but your caller doesn't provide + /// a context and you are unwilling to change all callers to provide one. + /// + /// Before we add cancellation, we should get rid of this method. + pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + Self::new(task_kind, download_behavior) + } + + fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + RequestContext { + task_kind, + download_behavior, + } + } + + pub fn task_kind(&self) -> TaskKind { + self.task_kind + } + + pub fn download_behavior(&self) -> DownloadBehavior { + self.download_behavior + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index f9b8a81dad..23faff7ace 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -430,6 +430,13 @@ paths: schema: type: string format: hex + - name: inputs_only + in: query + required: false + schema: + type: boolean + description: | + When true, skip calculation and only provide the model inputs (for debugging). Defaults to false. get: description: | Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). @@ -449,8 +456,9 @@ paths: format: hex size: type: integer + nullable: true description: | - Size metric in bytes. + Size metric in bytes or null if inputs_only=true was given. 
"401": description: Unauthorized Error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1eb24c1507..a7802f3cbe 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,8 +12,11 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo, }; +use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; +use crate::tenant::mgr::TenantMapInsertError; use crate::tenant::{PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; use utils::{ @@ -81,18 +84,39 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res fn apierror_from_prerror(err: PageReconstructError) -> ApiError { match err { PageReconstructError::Other(err) => ApiError::InternalServerError(err), + PageReconstructError::NeedsDownload(_, _) => { + // This shouldn't happen, because we use a RequestContext that requests to + // download any missing layer files on-demand. + ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) + } + PageReconstructError::Cancelled => { + ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) + } PageReconstructError::WalRedo(err) => { ApiError::InternalServerError(anyhow::Error::new(err)) } } } +fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError { + match e { + TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { + ApiError::InternalServerError(anyhow::Error::new(e)) + } + TenantMapInsertError::TenantAlreadyExists(id, state) => { + ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } + TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, + ctx: &RequestContext, ) -> anyhow::Result { - let mut info = build_timeline_info_common(timeline)?; + let mut info = build_timeline_info_common(timeline, ctx)?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. // Otherwise, if someone deletes the timeline / detaches the tenant while @@ -102,6 +126,7 @@ async fn build_timeline_info( .get_current_logical_size_non_incremental( info.last_record_lsn, CancellationToken::new(), + ctx, ) .await?, ); @@ -109,7 +134,10 @@ async fn build_timeline_info( Ok(info) } -fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result { +fn build_timeline_info_common( + timeline: &Arc, + ctx: &RequestContext, +) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -129,7 +157,7 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result None, lsn @ Lsn(_) => Some(lsn), }; - let current_logical_size = match timeline.get_current_logical_size() { + let current_logical_size = match timeline.get_current_logical_size(ctx) { Ok((size, _)) => Some(size), Err(err) => { error!("Timeline info creation failed to get current logical size: {err:?}"); @@ -180,6 +208,8 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result { // Created. Construct a TimelineInfo for it. 
- let timeline_info = build_timeline_info_common(&new_timeline) + let timeline_info = build_timeline_info_common(&new_timeline, &ctx) .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } @@ -208,6 +239,8 @@ async fn timeline_list_handler(request: Request) -> Result, query_param_present(&request, "include-non-incremental-logical-size"); check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let response_data = async { let tenant = mgr::get_tenant(tenant_id, true) .await @@ -217,7 +250,7 @@ async fn timeline_list_handler(request: Request) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { let timeline_info = - build_timeline_info(&timeline, include_non_incremental_logical_size) + build_timeline_info(&timeline, include_non_incremental_logical_size, &ctx) .await .context( "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}", @@ -239,11 +272,7 @@ fn query_param_present(request: &Request, param: &str) -> bool { request .uri() .query() - .map(|v| { - url::form_urlencoded::parse(v.as_bytes()) - .into_owned() - .any(|(p, _)| p == param) - }) + .map(|v| url::form_urlencoded::parse(v.as_bytes()).any(|(p, _)| p == param)) .unwrap_or(false) } @@ -252,13 +281,12 @@ fn get_query_param(request: &Request, param_name: &str) -> Result) -> Result) -> Result(timeline_info) } @@ -304,12 +336,13 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result) -> Result, let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + info!("Handling tenant attach {tenant_id}"); let state = get_state(&request); if let Some(remote_storage) = &state.remote_storage { - // FIXME: distinguish between "Tenant already exists" and other errors - mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) + mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) .await - .map_err(ApiError::InternalServerError)?; + .map_err(apierror_from_tenant_map_insert_error)?; } else { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" @@ -351,7 +385,9 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, A let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let state = get_state(&request); - mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone()) + mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx) .instrument(info_span!("load", tenant = %tenant_id)) .await - .map_err(ApiError::InternalServerError)?; + .map_err(apierror_from_tenant_map_insert_error)?; json_response(StatusCode::ACCEPTED, ()) } @@ -413,6 +451,8 @@ async fn tenant_list_handler(request: Request) -> Result, A let response_data = mgr::list_tenants() .instrument(info_span!("tenant_list")) .await + .map_err(anyhow::Error::new) + .map_err(ApiError::InternalServerError)? 
.iter() .map(|(id, state)| TenantInfo { id: *id, @@ -453,21 +493,40 @@ async fn tenant_status(request: Request) -> Result, ApiErro json_response(StatusCode::OK, tenant_info) } +/// HTTP endpoint to query the current tenant_size of a tenant. +/// +/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used +/// to debug any of the calculations. Requires `tenant_id` request parameter, supports +/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model +/// values. async fn tenant_size_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let inputs_only = if query_param_present(&request, "inputs_only") { + get_query_param(&request, "inputs_only")? + .parse() + .map_err(|_| ApiError::BadRequest(anyhow!("failed to parse inputs_only")))? + } else { + false + }; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let tenant = mgr::get_tenant(tenant_id, true) .await .map_err(ApiError::InternalServerError)?; - // this can be long operation, it currently is not backed by any request coalescing or similar + // this can be long operation let inputs = tenant - .gather_size_inputs() + .gather_size_inputs(&ctx) .await .map_err(ApiError::InternalServerError)?; - let size = inputs.calculate().map_err(ApiError::InternalServerError)?; + let size = if !inputs_only { + Some(inputs.calculate().map_err(ApiError::InternalServerError)?) + } else { + None + }; /// Private response type with the additional "unstable" `inputs` field. /// @@ -479,7 +538,9 @@ async fn tenant_size_handler(request: Request) -> Result, A #[serde_as(as = "serde_with::DisplayFromStr")] id: TenantId, /// Size is a mixture of WAL and logical size, so the unit is bytes. - size: u64, + /// + /// Will be none if `?inputs_only=true` was given. + size: Option, inputs: crate::tenant::size::ModelInputs, } @@ -506,6 +567,8 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let request_data: TenantCreateRequest = json_request(&mut request).await?; let mut tenant_conf = TenantConfOpt::default(); @@ -583,34 +646,28 @@ async fn tenant_create_handler(mut request: Request) -> Result { - // We created the tenant. Existing API semantics are that the tenant - // is Active when this function returns. - if let res @ Err(_) = tenant.wait_to_become_active().await { - // This shouldn't happen because we just created the tenant directory - // in tenant::mgr::create_tenant, and there aren't any remote timelines - // to load, so, nothing can really fail during load. - // Don't do cleanup because we don't know how we got here. - // The tenant will likely be in `Broken` state and subsequent - // calls will fail. - res.context("created tenant failed to become active") - .map_err(ApiError::InternalServerError)?; - } - json_response( - StatusCode::CREATED, - TenantCreateResponse(tenant.tenant_id()), - )? - } - None => json_response(StatusCode::CONFLICT, ())?, - }) + // We created the tenant. Existing API semantics are that the tenant + // is Active when this function returns. 
+ if let res @ Err(_) = new_tenant.wait_to_become_active().await { + // This shouldn't happen because we just created the tenant directory + // in tenant::mgr::create_tenant, and there aren't any remote timelines + // to load, so, nothing can really fail during load. + // Don't do cleanup because we don't know how we got here. + // The tenant will likely be in `Broken` state and subsequent + // calls will fail. + res.context("created tenant failed to become active") + .map_err(ApiError::InternalServerError)?; + } + json_response( + StatusCode::CREATED, + TenantCreateResponse(new_tenant.tenant_id()), + ) } async fn tenant_config_handler(mut request: Request) -> Result, ApiError> { @@ -732,7 +789,8 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result Result<()> { let mut pg_control: Option = None; @@ -69,7 +71,7 @@ pub async fn import_timeline_from_postgres_datadir( let mut file = tokio::fs::File::open(absolute_path).await?; let len = metadata.len() as usize; if let Some(control_file) = - import_file(&mut modification, relative_path, &mut file, len).await? + import_file(&mut modification, relative_path, &mut file, len, ctx).await? { pg_control = Some(control_file); } @@ -99,6 +101,7 @@ pub async fn import_timeline_from_postgres_datadir( tline, Lsn(pg_control.checkPointCopy.redo), pgdata_lsn, + ctx, ) .await?; @@ -113,6 +116,7 @@ async fn import_rel( dboid: Oid, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Does it look like a relation file? trace!("importing rel file {}", path.display()); @@ -147,7 +151,10 @@ async fn import_rel( // FIXME: use proper error type for this, instead of parsing the error message. // Or better yet, keep track of which relations we've already created // https://github.com/neondatabase/neon/issues/3309 - if let Err(e) = modification.put_rel_creation(rel, nblocks as u32).await { + if let Err(e) = modification + .put_rel_creation(rel, nblocks as u32, ctx) + .await + { if e.to_string().contains("already exists") { debug!("relation {} already exists. we must be extending it", rel); } else { @@ -182,7 +189,7 @@ async fn import_rel( // // If we process rel segments out of order, // put_rel_extend will skip the update. 
- modification.put_rel_extend(rel, blknum).await?; + modification.put_rel_extend(rel, blknum, ctx).await?; Ok(()) } @@ -195,6 +202,7 @@ async fn import_slru( path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("importing slru file {path:?}"); @@ -211,7 +219,7 @@ async fn import_slru( ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); modification - .put_slru_segment_creation(slru, segno, nblocks as u32) + .put_slru_segment_creation(slru, segno, nblocks as u32, ctx) .await?; let mut rpageno = 0; @@ -252,15 +260,15 @@ async fn import_wal( tline: &Timeline, startpoint: Lsn, endpoint: Lsn, + ctx: &RequestContext, ) -> anyhow::Result<()> { - use std::io::Read; let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(tline, startpoint).await?; + let mut walingest = WalIngest::new(tline, startpoint, ctx).await?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -283,6 +291,7 @@ async fn import_wal( file.seek(std::io::SeekFrom::Start(offset as u64))?; } + use std::io::Read; let nread = file.read_to_end(&mut buf)?; if nread != WAL_SEGMENT_SIZE - offset { // Maybe allow this for .partial files? @@ -297,7 +306,7 @@ async fn import_wal( while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; last_lsn = lsn; @@ -326,6 +335,7 @@ pub async fn import_basebackup_from_tar( tline: &Timeline, reader: &mut (impl AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, + ctx: &RequestContext, ) -> Result<()> { info!("importing base at {base_lsn}"); let mut modification = tline.begin_modification(base_lsn); @@ -344,7 +354,7 @@ pub async fn import_basebackup_from_tar( match header.entry_type() { tokio_tar::EntryType::Regular => { if let Some(res) = - import_file(&mut modification, file_path.as_ref(), &mut entry, len).await? + import_file(&mut modification, file_path.as_ref(), &mut entry, len, ctx).await? { // We found the pg_control file. pg_control = Some(res); @@ -376,13 +386,14 @@ pub async fn import_wal_from_tar( reader: &mut (impl AsyncRead + Send + Sync + Unpin), start_lsn: Lsn, end_lsn: Lsn, + ctx: &RequestContext, ) -> Result<()> { // Set up walingest mutable state let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; - let mut walingest = WalIngest::new(tline, start_lsn).await?; + let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); @@ -431,7 +442,7 @@ pub async fn import_wal_from_tar( while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; last_lsn = lsn; @@ -466,6 +477,7 @@ async fn import_file( file_path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> Result> { let file_name = match file_path.file_name() { Some(name) => name.to_string_lossy(), @@ -498,14 +510,16 @@ async fn import_file( } "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; - modification.put_relmap_file(spcnode, dbnode, bytes).await?; + modification + .put_relmap_file(spcnode, dbnode, bytes, ctx) + .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; + import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } @@ -521,38 +535,40 @@ async fn import_file( match file_name.as_ref() { "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; - modification.put_relmap_file(spcnode, dbnode, bytes).await?; + modification + .put_relmap_file(spcnode, dbnode, bytes, ctx) + .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; + import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported clog slru"); } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact offsets slru"); } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { let xid = u32::from_str_radix(file_name.as_ref(), 16)?; let bytes = read_all_bytes(reader).await?; modification - .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..])) + .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; debug!("imported twophase file"); } else if file_path.starts_with("pg_wal") { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 91cde477ad..09e21ae755 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,7 +1,9 @@ mod auth; pub mod basebackup; +pub mod broker_client; pub mod config; pub mod consumption_metrics; +pub mod context; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -15,7 +17,6 @@ pub mod tenant; pub mod trace; pub mod virtual_file; pub mod walingest; -pub mod walreceiver; pub mod walrecord; pub mod walredo; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b61e64048b..6bd0eddbb5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,10 +1,12 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ - register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, - register_int_gauge, 
register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec, - IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, + UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use pageserver_api::models::state; use utils::id::{TenantId, TimelineId}; /// Prometheus histogram buckets (in seconds) that capture the majority of @@ -35,11 +37,29 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[ "gc", ]; -pub static STORAGE_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_storage_operations_seconds", - "Time spent on storage operations", +pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { + register_counter_vec!( + "pageserver_storage_operations_seconds_sum", + "Total time spent on storage operations with operation, tenant and timeline dimensions", &["operation", "tenant_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_storage_operations_seconds_count", + "Count of storage operations with operation, tenant and timeline dimensions", + &["operation", "tenant_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_storage_operations_seconds_global", + "Time spent on storage operations", + &["operation"], get_buckets_for_critical_operations(), ) .expect("failed to define a metric") @@ -112,6 +132,24 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +// Metrics collected on tenant states. +const TENANT_STATE_OPTIONS: &[&str] = &[ + state::LOADING, + state::ATTACHING, + state::ACTIVE, + state::STOPPING, + state::BROKEN, +]; + +pub static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tenant_states_count", + "Count of tenants per state", + &["tenant_id", "state"] + ) + .expect("Failed to register pageserver_tenant_states_count metric") +}); + // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { @@ -375,18 +413,81 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +/// Similar to [`prometheus::HistogramTimer`] but does not record on drop. +pub struct StorageTimeMetricsTimer { + metrics: StorageTimeMetrics, + start: Instant, +} + +impl StorageTimeMetricsTimer { + fn new(metrics: StorageTimeMetrics) -> Self { + Self { + metrics, + start: Instant::now(), + } + } + + /// Record the time from creation to now. + pub fn stop_and_record(self) { + let duration = self.start.elapsed().as_secs_f64(); + self.metrics.timeline_sum.inc_by(duration); + self.metrics.timeline_count.inc(); + self.metrics.global_histogram.observe(duration); + } +} + +/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and +/// timeline total sum and count. 
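// Illustration only, not part of this patch: how the new timer is intended
// to be used. Unlike prometheus::HistogramTimer, dropping it without calling
// stop_and_record() records nothing, so abandoned operations do not skew the
// per-timeline sum/count. `time_storage_operation` is a hypothetical helper.
fn time_storage_operation<F: FnOnce()>(metrics: &StorageTimeMetrics, op: F) {
    let timer = metrics.start_timer();
    op();
    // Adds the elapsed seconds to the per-timeline sum, increments the
    // per-timeline count, and observes the global histogram.
    timer.stop_and_record();
}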
+#[derive(Clone, Debug)] +pub struct StorageTimeMetrics { + /// Sum of f64 seconds, per operation, tenant_id and timeline_id + timeline_sum: Counter, + /// Number of oeprations, per operation, tenant_id and timeline_id + timeline_count: IntCounter, + /// Global histogram having only the "operation" label. + global_histogram: Histogram, +} + +impl StorageTimeMetrics { + pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self { + let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .unwrap(); + let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .unwrap(); + let global_histogram = STORAGE_TIME_GLOBAL + .get_metric_with_label_values(&[operation]) + .unwrap(); + + StorageTimeMetrics { + timeline_sum, + timeline_count, + global_histogram, + } + } + + /// Starts timing a new operation. + /// + /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop. + pub fn start_timer(&self) -> StorageTimeMetricsTimer { + StorageTimeMetricsTimer::new(self.clone()) + } +} + #[derive(Debug)] pub struct TimelineMetrics { tenant_id: String, timeline_id: String, pub reconstruct_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, - pub flush_time_histo: Histogram, - pub compact_time_histo: Histogram, - pub create_images_time_histo: Histogram, - pub init_logical_size_histo: Histogram, - pub logical_size_histo: Histogram, - pub load_layer_map_histo: Histogram, + pub flush_time_histo: StorageTimeMetrics, + pub compact_time_histo: StorageTimeMetrics, + pub create_images_time_histo: StorageTimeMetrics, + pub init_logical_size_histo: StorageTimeMetrics, + pub logical_size_histo: StorageTimeMetrics, + pub load_layer_map_histo: StorageTimeMetrics, + pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, @@ -406,24 +507,16 @@ impl TimelineMetrics { let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) - .unwrap(); - let init_logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) - .unwrap(); - let logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id]) - .unwrap(); - let load_layer_map_histo = STORAGE_TIME - .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) - .unwrap(); + let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id); + let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id); + let create_images_time_histo = + StorageTimeMetrics::new("create images", &tenant_id, &timeline_id); + let init_logical_size_histo = + StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id); + let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id); + let load_layer_map_histo = + StorageTimeMetrics::new("load layer 
map", &tenant_id, &timeline_id); + let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -453,6 +546,7 @@ impl TimelineMetrics { create_images_time_histo, init_logical_size_histo, logical_size_histo, + garbage_collect_histo, load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, @@ -478,7 +572,10 @@ impl Drop for TimelineMetrics { let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); for op in STORAGE_TIME_OPERATIONS { - let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = + STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = + STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); } for op in STORAGE_IO_TIME_OPERATIONS { let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]); @@ -495,7 +592,10 @@ impl Drop for TimelineMetrics { } pub fn remove_tenant_metrics(tenant_id: &TenantId) { - let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); + let tid = tenant_id.to_string(); + for state in TENANT_STATE_OPTIONS { + let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]); + } } use futures::Future; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 344a8d1c00..878928ae06 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,6 +13,7 @@ use anyhow::Context; use bytes::Buf; use bytes::Bytes; use futures::{Stream, StreamExt}; +use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, @@ -30,19 +31,19 @@ use std::sync::Arc; use std::time::Duration; use tracing::*; use utils::id::ConnectionId; -use utils::postgres_backend_async::QueryError; use utils::{ auth::{Claims, JwtAuth, Scope}, id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - postgres_backend_async::{self, PostgresBackend}, + postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError}, simple_rcu::RcuReadGuard, }; use crate::auth::check_permission; use crate::basebackup; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::task_mgr; @@ -123,6 +124,7 @@ pub async fn libpq_listener_main( auth: Option>, listener: TcpListener, auth_type: AuthType, + listener_ctx: RequestContext, ) -> anyhow::Result<()> { listener.set_nonblocking(true)?; let tokio_listener = tokio::net::TcpListener::from_std(listener)?; @@ -146,6 +148,9 @@ pub async fn libpq_listener_main( debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); + let connection_ctx = listener_ctx + .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); + // PageRequestHandler tasks are not associated with any particular // timeline in the task manager. 
In practice most connections will // only deal with a particular timeline, but we don't know which one @@ -157,7 +162,7 @@ pub async fn libpq_listener_main( None, "serving compute connection task", false, - page_service_conn_main(conf, local_auth, socket, auth_type), + page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx), ); } Err(err) => { @@ -177,6 +182,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, + connection_ctx: RequestContext, ) -> anyhow::Result<()> { // Immediately increment the gauge, then create a job to decrement it on task exit. // One of the pros of `defer!` is that this will *most probably* @@ -191,24 +197,24 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; - let mut conn_handler = PageServerHandler::new(conf, auth); + // XXX: pgbackend.run() should take the connection_ctx, + // and create a child per-query context when it invokes process_query. + // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler + // and create the per-query context in process_query ourselves. + let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx); let pgbackend = PostgresBackend::new(socket, auth_type, None)?; - let result = pgbackend + match pgbackend .run(&mut conn_handler, task_mgr::shutdown_watcher) - .await; - match result { + .await + { Ok(()) => { // we've been requested to shut down Ok(()) } Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { - // `ConnectionReset` error happens when the Postgres client closes the connection. - // As this disconnection happens quite often and is expected, - // we decided to downgrade the logging level to `INFO`. - // See: https://github.com/neondatabase/neon/issues/1683. - if io_error.kind() == io::ErrorKind::ConnectionReset { - info!("Postgres client disconnected"); + if is_expected_io_error(&io_error) { + info!("Postgres client disconnected ({io_error})"); Ok(()) } else { Err(io_error).context("Postgres connection error") @@ -255,30 +261,42 @@ struct PageServerHandler { _conf: &'static PageServerConf, auth: Option>, claims: Option, + + /// The context created for the lifetime of the connection + /// services by this PageServerHandler. + /// For each query received over the connection, + /// `process_query` creates a child context from this one. 
+ connection_ctx: RequestContext, } impl PageServerHandler { - pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { + pub fn new( + conf: &'static PageServerConf, + auth: Option>, + connection_ctx: RequestContext, + ) -> Self { PageServerHandler { _conf: conf, auth, claims: None, + connection_ctx, } } - #[instrument(skip(self, pgb))] + #[instrument(skip(self, pgb, ctx))] async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + ctx: RequestContext, ) -> anyhow::Result<()> { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Make request tracer if needed - let tenant = get_active_tenant_with_timeout(tenant_id).await?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; let mut tracer = if tenant.get_trace_read_requests() { let connection_id = ConnectionId::generate(); let path = tenant @@ -329,22 +347,27 @@ impl PageServerHandler { let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + // TODO: We could create a new per-request context here, with unique ID. + // Currently we use the same per-timeline context for all requests + let response = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let _timer = metrics.get_rel_exists.start_timer(); - self.handle_get_rel_exists_request(&timeline, &req).await + self.handle_get_rel_exists_request(&timeline, &req, &ctx) + .await } PagestreamFeMessage::Nblocks(req) => { let _timer = metrics.get_rel_size.start_timer(); - self.handle_get_nblocks_request(&timeline, &req).await + self.handle_get_nblocks_request(&timeline, &req, &ctx).await } PagestreamFeMessage::GetPage(req) => { let _timer = metrics.get_page_at_lsn.start_timer(); - self.handle_get_page_at_lsn_request(&timeline, &req).await + self.handle_get_page_at_lsn_request(&timeline, &req, &ctx) + .await } PagestreamFeMessage::DbSize(req) => { let _timer = metrics.get_db_size.start_timer(); - self.handle_db_size_request(&timeline, &req).await + self.handle_db_size_request(&timeline, &req, &ctx).await } }; @@ -363,7 +386,8 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb))] + #[allow(clippy::too_many_arguments)] + #[instrument(skip(self, pgb, ctx))] async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, @@ -372,12 +396,13 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, pg_version: u32, + ctx: RequestContext, ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let tenant = get_active_tenant_with_timeout(tenant_id).await?; - let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; + let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -396,7 +421,7 @@ impl PageServerHandler { let mut copyin_stream = Box::pin(copyin_stream(pgb)); timeline - .import_basebackup_from_tar(&mut copyin_stream, base_lsn) + .import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx) .await?; // Drain the rest of the Copy data @@ -418,7 +443,7 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb))] + #[instrument(skip(self, pgb, ctx))] async fn handle_import_wal( &self, pgb: &mut PostgresBackend, @@ -426,10 +451,11 @@ impl PageServerHandler { timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, + ctx: RequestContext, ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let last_record_lsn = timeline.get_last_record_lsn(); if last_record_lsn != start_lsn { return Err(QueryError::Other( @@ -446,7 +472,7 @@ impl PageServerHandler { pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream); - import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?; + import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?; info!("wal import complete"); // Drain the rest of the Copy data @@ -492,6 +518,7 @@ impl PageServerHandler { mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, + ctx: &RequestContext, ) -> anyhow::Result { if latest { // Latest page version was requested. If LSN is given, it is a hint @@ -515,7 +542,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, ctx).await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -525,7 +552,7 @@ impl PageServerHandler { if lsn == Lsn(0) { anyhow::bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, ctx).await?; } anyhow::ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -535,52 +562,60 @@ impl PageServerHandler { Ok(lsn) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_rel_exists_request( &self, timeline: &Timeline, req: &PagestreamExistsRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; - let exists = timeline.get_rel_exists(req.rel, lsn, req.latest).await?; + let exists = timeline + .get_rel_exists(req.rel, lsn, req.latest, ctx) + .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, })) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_nblocks_request( &self, timeline: &Timeline, req: &PagestreamNblocksRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest).await?; + let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, })) } - #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] async fn handle_db_size_request( &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -589,15 +624,17 @@ impl PageServerHandler { })) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, 
&latest_gc_cutoff_lsn, ctx) + .await?; /* // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. @@ -608,7 +645,7 @@ impl PageServerHandler { */ let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) + .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -616,23 +653,25 @@ impl PageServerHandler { })) } - #[instrument(skip(self, pgb))] + #[allow(clippy::too_many_arguments)] + #[instrument(skip(self, pgb, ctx))] async fn handle_basebackup_request( - &self, + &mut self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, + ctx: RequestContext, ) -> anyhow::Result<()> { // check that the timeline exists - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, &ctx).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -645,8 +684,15 @@ impl PageServerHandler { // Send a tarball of the latest layer on the timeline { let mut writer = pgb.copyout_writer(); - basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup) - .await?; + basebackup::send_basebackup_tarball( + &mut writer, + &timeline, + lsn, + prev_lsn, + full_backup, + &ctx, + ) + .await?; } pgb.write_message(&BeMessage::CopyDone)?; @@ -717,6 +763,7 @@ impl postgres_backend_async::Handler for PageServerHandler { pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); if query_string.starts_with("pagestream ") { @@ -734,7 +781,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id) + self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); @@ -763,7 +810,7 @@ impl postgres_backend_async::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false) + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx) .await?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } @@ -784,7 +831,7 @@ impl postgres_backend_async::Handler for PageServerHandler { .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let end_of_timeline = timeline.get_last_record_rlsn(); @@ -835,7 +882,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true) + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx) 
.await?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { @@ -878,6 +925,7 @@ impl postgres_backend_async::Handler for PageServerHandler { base_lsn, end_lsn, pg_version, + ctx, ) .await { @@ -914,7 +962,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn) + .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) .await { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, @@ -944,7 +992,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; - let tenant = get_active_tenant_with_timeout(tenant_id).await?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), @@ -990,27 +1038,66 @@ impl postgres_backend_async::Handler for PageServerHandler { } } +#[derive(thiserror::Error, Debug)] +enum GetActiveTenantError { + #[error( + "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}" + )] + WaitForActiveTimeout { + latest_state: TenantState, + wait_time: Duration, + }, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for QueryError { + fn from(e: GetActiveTenantError) -> Self { + match e { + GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( + ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), + ), + GetActiveTenantError::Other(e) => QueryError::Other(e), + } + } +} + /// Get active tenant. /// /// If the tenant is Loading, waits for it to become Active, for up to 30 s. That /// ensures that queries don't fail immediately after pageserver startup, because /// all tenants are still loading. -async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result> { +async fn get_active_tenant_with_timeout( + tenant_id: TenantId, + _ctx: &RequestContext, /* require get a context to support cancellation in the future */ +) -> Result, GetActiveTenantError> { let tenant = mgr::get_tenant(tenant_id, false).await?; - match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await { - Ok(wait_result) => wait_result - // no .context(), the error message is good enough and some tests depend on it - .map(move |()| tenant), - Err(_) => anyhow::bail!("Timeout waiting for tenant {tenant_id} to become Active"), + let wait_time = Duration::from_secs(30); + match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { + Ok(Ok(())) => Ok(tenant), + // no .context(), the error message is good enough and some tests depend on it + Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)), + Err(_) => { + let latest_state = tenant.current_state(); + if latest_state == TenantState::Active { + Ok(tenant) + } else { + Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state, + wait_time, + }) + } + } } } /// Shorthand for getting a reference to a Timeline of an Active tenant. 
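// Illustration only, not part of this patch: how a caller can now tell a
// "tenant never became Active within the timeout" failure apart from other
// errors. The surrounding function is a hypothetical sketch.
async fn require_active_tenant_sketch(
    tenant_id: TenantId,
    ctx: &RequestContext,
) -> Result<(), QueryError> {
    match get_active_tenant_with_timeout(tenant_id, ctx).await {
        Ok(_tenant) => Ok(()),
        Err(e @ GetActiveTenantError::WaitForActiveTimeout { .. }) => {
            // Becomes a Disconnected(TimedOut) QueryError via the From impl above.
            Err(e.into())
        }
        Err(GetActiveTenantError::Other(e)) => Err(QueryError::Other(e)),
    }
}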
-async fn get_active_timeline_with_timeout( +async fn get_active_tenant_timeline( tenant_id: TenantId, timeline_id: TimelineId, -) -> anyhow::Result> { - get_active_tenant_with_timeout(tenant_id) - .await - .and_then(|tenant| tenant.get_timeline(timeline_id, true)) + ctx: &RequestContext, +) -> Result, GetActiveTenantError> { + let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?; + let timeline = tenant.get_timeline(timeline_id, true)?; + Ok(timeline) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index cc521c5e35..6f9035305d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,6 +7,7 @@ //! Clarify that) //! use super::tenant::{PageReconstructError, Timeline}; +use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; @@ -97,6 +98,7 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -104,7 +106,7 @@ impl Timeline { ))); } - let nblocks = self.get_rel_size(tag, lsn, latest).await?; + let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -114,7 +116,7 @@ impl Timeline { } let key = rel_block_to_key(tag, blknum); - self.get(key, lsn).await + self.get(key, lsn, ctx).await } // Get size of a database in blocks @@ -124,13 +126,14 @@ impl Timeline { dbnode: Oid, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn).await?; + let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest).await?; + let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -142,6 +145,7 @@ impl Timeline { tag: RelTag, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -154,7 +158,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest).await? + && !self.get_rel_exists(tag, lsn, latest, ctx).await? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -164,7 +168,7 @@ impl Timeline { } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn).await?; + let mut buf = self.get(key, lsn, ctx).await?; let nblocks = buf.get_u32_le(); if latest { @@ -186,6 +190,7 @@ impl Timeline { tag: RelTag, lsn: Lsn, _latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -199,7 +204,7 @@ impl Timeline { } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -216,10 +221,11 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -244,9 +250,10 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = slru_block_to_key(kind, segno, blknum); - self.get(key, lsn).await + self.get(key, lsn, ctx).await } /// Get size of an SLRU segment @@ -255,9 +262,10 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn).await?; + let mut buf = self.get(key, lsn, ctx).await?; Ok(buf.get_u32_le()) } @@ -267,10 +275,11 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, + ctx: &RequestContext, ) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -291,6 +300,7 @@ impl Timeline { pub async fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, + ctx: &RequestContext, ) -> Result { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; @@ -313,6 +323,7 @@ impl Timeline { Lsn(mid * 8), &mut found_smaller, &mut found_larger, + ctx, ) .await?; @@ -362,14 +373,18 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, + ctx: &RequestContext, ) -> Result { - for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn).await? { + for segno in self + .list_slru_segments(SlruKind::Clog, probe_lsn, ctx) + .await? 
+ { let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn) + .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx) .await?; for blknum in (0..nblocks).rev() { let clog_page = self - .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn) + .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx) .await?; if clog_page.len() == BLCKSZ as usize + 8 { @@ -394,11 +409,12 @@ impl Timeline { &self, kind: SlruKind, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.segments), Err(e) => Err(PageReconstructError::from(e)), @@ -410,18 +426,21 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = relmap_file_key(spcnode, dbnode); - self.get(key, lsn).await + let buf = self.get(key, lsn, ctx).await?; + Ok(buf) } pub async fn list_dbdirs( &self, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry - let buf = self.get(DBDIR_KEY, lsn).await?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await?; match DbDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.dbdirs), @@ -433,18 +452,20 @@ impl Timeline { &self, xid: TransactionId, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = twophase_file_key(xid); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; Ok(buf) } pub async fn list_twophase_files( &self, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry - let buf = self.get(TWOPHASEDIR_KEY, lsn).await?; + let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; match TwoPhaseDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.xids), @@ -452,12 +473,20 @@ impl Timeline { } } - pub async fn get_control_file(&self, lsn: Lsn) -> Result { - self.get(CONTROLFILE_KEY, lsn).await + pub async fn get_control_file( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + self.get(CONTROLFILE_KEY, lsn, ctx).await } - pub async fn get_checkpoint(&self, lsn: Lsn) -> Result { - self.get(CHECKPOINT_KEY, lsn).await + pub async fn get_checkpoint( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + self.get(CHECKPOINT_KEY, lsn, ctx).await } /// Does the same as get_current_logical_size but counted on demand. @@ -469,15 +498,16 @@ impl Timeline { &self, lsn: Lsn, cancel: CancellationToken, + ctx: &RequestContext, ) -> Result { // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn).await.context("read dbdir")?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self - .list_rels(*spcnode, *dbnode, lsn) + .list_rels(*spcnode, *dbnode, lsn, ctx) .await .context("list rels")? 
{ @@ -486,9 +516,9 @@ impl Timeline { } let relsize_key = rel_size_to_key(rel); let mut buf = self - .get(relsize_key, lsn) + .get(relsize_key, lsn, ctx) .await - .context("read relation size of {rel:?}")?; + .with_context(|| format!("read relation size of {rel:?}"))?; let relsize = buf.get_u32_le(); total_size += relsize as u64; @@ -501,7 +531,11 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { + pub async fn collect_keyspace( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -509,7 +543,7 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn).await?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); @@ -519,14 +553,14 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn) + .list_rels(spcnode, dbnode, lsn, ctx) .await? .into_iter() .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn).await?; + let mut buf = self.get(relsize_key, lsn, ctx).await?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -542,13 +576,13 @@ impl Timeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn).await?; + let buf = self.get(slrudir_key, lsn, ctx).await?; let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn).await?; + let mut buf = self.get(segsize_key, lsn, ctx).await?; let segsize = buf.get_u32_le(); result.add_range( @@ -560,7 +594,7 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn).await?; + let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); @@ -723,9 +757,10 @@ impl<'a> DatadirModification<'a> { spcnode: Oid, dbnode: Oid, img: Bytes, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) - let buf = self.get(DBDIR_KEY).await?; + let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); @@ -755,9 +790,10 @@ impl<'a> DatadirModification<'a> { &mut self, xid: TransactionId, img: Bytes, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY).await?; + let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); @@ -781,16 +817,21 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub async 
fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { + pub async fn drop_dbdir( + &mut self, + spcnode: Oid, + dbnode: Oid, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let req_lsn = self.tline.get_last_record_lsn(); let total_blocks = self .tline - .get_db_size(spcnode, dbnode, req_lsn, true) + .get_db_size(spcnode, dbnode, req_lsn, true, ctx) .await?; // Remove entry from dbdir - let buf = self.get(DBDIR_KEY).await?; + let buf = self.get(DBDIR_KEY, ctx).await?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; @@ -817,11 +858,12 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).await?)?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { // Didn't exist. Update dbdir @@ -833,7 +875,7 @@ impl<'a> DatadirModification<'a> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key).await?)? + RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? }; // Add the new relation to the rel directory entry, and write it back @@ -865,13 +907,14 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true).await? { + if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? { let size_key = rel_size_to_key(rel); // Fetch the old size first - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); // Update the entry with the new size. let buf = nblocks.to_le_bytes(); @@ -895,12 +938,13 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Put size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { @@ -916,12 +960,12 @@ impl<'a> DatadirModification<'a> { } /// Drop a relation. 
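// Illustration only, not part of this patch: the typical DatadirModification
// flow with the new `ctx` parameter threaded through. The LSN, block counts
// and function name are hypothetical; `commit()` is the existing method used
// by the import and ingest code.
async fn create_rel_sketch(
    tline: &Timeline,
    rel: RelTag,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    let mut modification = tline.begin_modification(Lsn(0x10));
    // The dbdir/reldir reads inside put_rel_creation now go through `ctx`.
    modification.put_rel_creation(rel, 1, ctx).await?;
    // Extending never shrinks the relation; out-of-order extends are skipped.
    modification.put_rel_extend(rel, 8, ctx).await?;
    modification.commit()?;
    Ok(())
}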
- pub async fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { + pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { @@ -932,7 +976,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache @@ -949,10 +993,11 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { @@ -988,10 +1033,15 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub async fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { + pub async fn drop_slru_segment( + &mut self, + kind: SlruKind, + segno: u32, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { @@ -1015,9 +1065,13 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub async fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + pub async fn drop_twophase_file( + &mut self, + xid: TransactionId, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // Remove it from the directory entry - let buf = self.get(TWOPHASEDIR_KEY).await?; + let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -1111,7 +1165,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - async fn get(&self, key: Key) -> Result { + async fn get(&self, key: Key, ctx: &RequestContext) -> Result { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -1132,7 +1186,7 @@ impl<'a> DatadirModification<'a> { } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - self.tline.get(key, lsn).await + self.tline.get(key, lsn, ctx).await } } @@ -1542,10 +1596,11 @@ pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result> { let tline = tenant - .create_empty_timeline(timeline_id, Lsn(8), pg_version)? - .initialize()?; + .create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)? 
+ .initialize(ctx)?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; @@ -1598,7 +1653,7 @@ mod tests { assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); // Create a branch, check that the relation is visible there - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + repo.branch_timeline(&tline, NEW_TIMELINE_ID, Lsn(0x30))?; let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { Some(timeline) => timeline, None => panic!("Should have a local timeline"), diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 586fd20886..092503b7c5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,6 +37,17 @@ impl Key { | self.field6 as i128 } + pub fn from_i128(x: i128) -> Self { + Key { + field1: ((x >> 120) & 0xf) as u8, + field2: ((x >> 104) & 0xFFFF) as u32, + field3: (x >> 72) as u32, + field4: (x >> 40) as u32, + field5: (x >> 32) as u8, + field6: x as u32, + } + } + pub fn next(&self) -> Key { self.add(1) } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 02e2e2ee14..09716ba0e0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -171,6 +171,9 @@ task_local! { /// #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum TaskKind { + // Pageserver startup, i.e., `main` + Startup, + // libpq listener task. It just accepts connection and spawns a // PageRequestHandler task for each connection. LibpqEndpointListener, @@ -183,13 +186,37 @@ pub enum TaskKind { // associated with one later, after receiving a command from the client. PageRequestHandler, - // Manages the WAL receiver connection for one timeline. It subscribes to - // events from storage_broker, decides which safekeeper to connect to. It spawns a - // separate WalReceiverConnection task to handle each connection. + /// Manages the WAL receiver connection for one timeline. + /// It subscribes to events from storage_broker and decides which safekeeper to connect to. + /// Once the decision has been made, it establishes the connection using the `tokio-postgres` library. + /// There is at most one connection at any given time. + /// + /// That `tokio-postgres` library represents a connection as two objects: a `Client` and a `Connection`. + /// The `Client` object is what library users use to make requests & get responses. + /// Internally, `Client` hands over requests to the `Connection` object. + /// The `Connection` object is responsible for speaking the wire protocol. + /// + /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. + /// That abstraction doesn't use `task_mgr`. + /// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task. + /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. + /// + /// Once the connection is established, the `TaskHandle` task creates a + /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// the `Connection` object. + /// A `CancellationToken` created by the `TaskHandle` task ensures + /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. WalReceiverManager, - // Handles a connection to a safekeeper, to stream WAL to a timeline. 
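The Client/Connection split referenced in the new task-kind comments above is a convention of the tokio-postgres library rather than anything defined in this patch. A minimal sketch of that convention, with a placeholder connection string and error handling, showing why the Connection half needs its own polling task (the role played here by WalReceiverConnectionPoller):

    // Sketch only (assumes the tokio and tokio-postgres crates); the connection
    // string is a placeholder. tokio-postgres returns a Client for issuing
    // requests and a Connection that must be polled to drive the wire protocol,
    // so the Connection is moved into a task of its own.
    async fn connect_sketch() -> Result<tokio_postgres::Client, tokio_postgres::Error> {
        let (client, connection) =
            tokio_postgres::connect("host=localhost user=postgres", tokio_postgres::NoTls).await?;
        tokio::spawn(async move {
            if let Err(e) = connection.await {
                eprintln!("connection error: {e}");
            }
        });
        // The client can now be used to send queries or start replication.
        Ok(client)
    }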
- WalReceiverConnection, + /// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`]. + /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. + /// See the comment on [`WalReceiverManager`]. + WalReceiverConnectionHandler, + + /// The task that polls the `tokio-postgres::Connection` object. + /// Spawned by task [`WalReceiverConnectionHandler`]. + /// See the comment on [`WalReceiverManager`]. + WalReceiverConnectionPoller, // Garbage collection worker. One per tenant GarbageCollector, @@ -200,6 +227,8 @@ pub enum TaskKind { // Initial logical size calculation InitialLogicalSizeCalculation, + OndemandLogicalSizeCalculation, + // Task that flushes frozen in-memory layers to disk LayerFlushTask, @@ -222,6 +251,12 @@ pub enum TaskKind { DownloadAllRemoteLayers, // Task that calculates synthetis size for all active tenants CalculateSyntheticSize, + + // A request that comes in via the pageserver HTTP API. + MgmtRequest, + + #[cfg(test)] + UnitTest, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c18c645e5b..2f45fe0dfc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -48,9 +48,10 @@ use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir; use crate::is_uninit_mark; -use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; +use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; @@ -174,7 +175,7 @@ impl UninitializedTimeline<'_> { /// /// The new timeline is initialized in Active state, and its background jobs are /// started - pub fn initialize(self) -> anyhow::Result> { + pub fn initialize(self, _ctx: &RequestContext) -> anyhow::Result> { let mut timelines = self.owning_tenant.timelines.lock().unwrap(); self.initialize_with_lock(&mut timelines, true, true) } @@ -188,7 +189,7 @@ impl UninitializedTimeline<'_> { mut self, timelines: &mut HashMap>, load_layer_map: bool, - launch_wal_receiver: bool, + activate: bool, ) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_id = self.owning_tenant.tenant_id; @@ -221,13 +222,12 @@ impl UninitializedTimeline<'_> { "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" ) })?; - new_timeline.set_state(TimelineState::Active); v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); - if launch_wal_receiver { - new_timeline.launch_wal_receiver(); + if activate { + new_timeline.activate(); } } } @@ -240,11 +240,12 @@ impl UninitializedTimeline<'_> { self, copyin_stream: &mut (impl Stream> + Sync + Send + Unpin), base_lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; let mut reader = tokio_util::io::StreamReader::new(copyin_stream); - import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn) + import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn, ctx) .await .context("Failed to import basebackup")?; @@ -262,9 +263,7 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - let timeline = self.initialize()?; - - Ok(timeline) + self.initialize(ctx) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -450,6 +449,7 @@ impl Tenant { /// /// If the operation 
fails, the timeline is left in the tenant's hash map in Broken state. On success, /// it is marked as Active. + #[allow(clippy::too_many_arguments)] async fn timeline_init_and_sync( &self, timeline_id: TimelineId, @@ -458,6 +458,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, + _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -573,6 +574,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, + ctx: &RequestContext, ) -> Arc { // XXX: Attach should provide the config, especially during tenant migration. // See https://github.com/neondatabase/neon/issues/1555 @@ -591,6 +593,7 @@ impl Tenant { // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); + let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn); task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, @@ -599,7 +602,7 @@ impl Tenant { "attach tenant", false, async move { - match tenant_clone.attach().await { + match tenant_clone.attach(ctx).await { Ok(_) => {} Err(e) => { tenant_clone.set_broken(&e.to_string()); @@ -615,8 +618,8 @@ impl Tenant { /// /// Background task that downloads all data for a tenant and brings it to Active state. /// - #[instrument(skip(self), fields(tenant_id=%self.tenant_id))] - async fn attach(self: &Arc) -> anyhow::Result<()> { + #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> { // Create directory with marker file to indicate attaching state. // The load_local_tenants() function in tenant::mgr relies on the marker file // to determine whether a tenant has finished attaching. @@ -716,6 +719,7 @@ impl Tenant { index_parts.remove(&timeline_id).unwrap(), remote_metadata, remote_clients.remove(&timeline_id).unwrap(), + &ctx, ) .await .with_context(|| { @@ -765,6 +769,7 @@ impl Tenant { index_part: IndexPart, remote_metadata: TimelineMetadata, remote_client: RemoteTimelineClient, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("downloading index file for timeline {}", timeline_id); tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id)) @@ -799,6 +804,7 @@ impl Tenant { local_metadata, ancestor, true, + ctx, ) .await } @@ -827,11 +833,12 @@ impl Tenant { /// If the loading fails for some reason, the Tenant will go into Broken /// state. /// - #[instrument(skip(conf, remote_storage), fields(tenant_id=%tenant_id))] + #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))] pub fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: Option, + ctx: &RequestContext, ) -> Arc { let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { Ok(conf) => conf, @@ -855,6 +862,7 @@ impl Tenant { // Do all the hard work in a background task let tenant_clone = Arc::clone(&tenant); + let ctx = ctx.detached_child(TaskKind::InitialLoad, DownloadBehavior::Warn); let _ = task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::InitialLoad, @@ -863,7 +871,7 @@ impl Tenant { "initial tenant load", false, async move { - match tenant_clone.load().await { + match tenant_clone.load(&ctx).await { Ok(()) => {} Err(err) => { tenant_clone.set_broken(&err.to_string()); @@ -884,8 +892,8 @@ impl Tenant { /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. 
/// - #[instrument(skip(self), fields(tenant_id=%self.tenant_id))] - async fn load(self: &Arc) -> anyhow::Result<()> { + #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + async fn load(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { info!("loading tenant task"); utils::failpoint_sleep_millis_async!("before-loading-tenant"); @@ -996,7 +1004,7 @@ impl Tenant { // 1. "Timeline has no ancestor and no layer files" for (timeline_id, local_metadata) in sorted_timelines { - self.load_local_timeline(timeline_id, local_metadata) + self.load_local_timeline(timeline_id, local_metadata, ctx) .await .with_context(|| format!("load local timeline {timeline_id}"))?; } @@ -1013,11 +1021,12 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata), fields(timeline_id=%timeline_id))] + #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))] async fn load_local_timeline( &self, timeline_id: TimelineId, local_metadata: TimelineMetadata, + ctx: &RequestContext, ) -> anyhow::Result<()> { let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) @@ -1061,6 +1070,7 @@ impl Tenant { Some(local_metadata), ancestor, false, + ctx, ) .await } @@ -1112,6 +1122,7 @@ impl Tenant { new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, + _ctx: &RequestContext, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), @@ -1153,6 +1164,7 @@ impl Tenant { ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result>> { anyhow::ensure!( self.is_active(), @@ -1190,13 +1202,16 @@ impl Tenant { // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(*lsn).await?; + ancestor_timeline.wait_lsn(*lsn, ctx).await?; } - self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn) + self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) + .await? + } + None => { + self.bootstrap_timeline(new_timeline_id, pg_version, ctx) .await? } - None => self.bootstrap_timeline(new_timeline_id, pg_version).await?, }; Ok(Some(loaded_timeline)) @@ -1220,30 +1235,25 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), "Cannot run GC iteration on inactive tenant" ); - let timeline_str = target_timeline_id - .map(|x| x.to_string()) - .unwrap_or_else(|| "-".to_string()); + let gc_result = self + .gc_iteration_internal(target_timeline_id, horizon, pitr, ctx) + .await; - { - let _timer = STORAGE_TIME - .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) - .start_timer(); - self.gc_iteration_internal(target_timeline_id, horizon, pitr) - .await - } + gc_result } /// Perform one compaction iteration. /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. 
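As the hunks above show, the tenant-level maintenance entry points now take an explicit RequestContext. A caller-side sketch of the new signatures, using only task kinds and download behaviors that appear in this patch; the tenant handle, horizon, and pitr values are placeholders and the pageserver crate's types are assumed to be in scope:

    // Sketch only: `tenant`, `horizon` and `pitr` are placeholders.
    async fn run_maintenance(tenant: &Tenant, horizon: u64, pitr: Duration) -> anyhow::Result<()> {
        // A context created for a management-style request; GC and compaction
        // then carry it down to every Timeline::get call they make.
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
        tenant.refresh_gc_info(&ctx).await?;
        tenant.gc_iteration(None, horizon, pitr, &ctx).await?;
        tenant.compaction_iteration(&ctx).await?;
        Ok(())
    }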
- pub async fn compaction_iteration(&self) -> anyhow::Result<()> { + pub async fn compaction_iteration(&self, ctx: &RequestContext) -> anyhow::Result<()> { anyhow::ensure!( self.is_active(), "Cannot run compaction iteration on inactive tenant" @@ -1265,7 +1275,7 @@ impl Tenant { for (timeline_id, timeline) in &timelines_to_compact { timeline - .compact() + .compact(ctx) .instrument(info_span!("compact_timeline", timeline = %timeline_id)) .await?; } @@ -1298,7 +1308,11 @@ impl Tenant { } /// Removes timeline-related in-memory data - pub async fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> { + pub async fn delete_timeline( + &self, + timeline_id: TimelineId, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. let timeline = { @@ -1462,8 +1476,7 @@ impl Tenant { tasks::start_background_loops(self.tenant_id); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Active); - timeline.launch_wal_receiver(); + timeline.activate(); } } } @@ -1487,7 +1500,7 @@ impl Tenant { .values() .filter(|timeline| timeline.current_state() != TimelineState::Broken); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Suspended); + timeline.set_state(TimelineState::Stopping); } } TenantState::Broken => { @@ -1717,7 +1730,33 @@ impl Tenant { tenant_id: TenantId, remote_storage: Option, ) -> Tenant { - let (state, _) = watch::channel(state); + let (state, mut rx) = watch::channel(state); + + tokio::spawn(async move { + let current_state = *rx.borrow_and_update(); + let tid = tenant_id.to_string(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, current_state.as_str()]) + .inc(); + loop { + match rx.changed().await { + Ok(()) => { + let new_state = *rx.borrow(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, current_state.as_str()]) + .dec(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, new_state.as_str()]) + .inc(); + } + Err(_sender_dropped_error) => { + info!("Tenant dropped the state updates sender, quitting waiting for tenant state change"); + return; + } + } + } + }); + Tenant { tenant_id, conf, @@ -1776,69 +1815,70 @@ impl Tenant { } pub(super) fn persist_tenant_config( + tenant_id: &TenantId, target_config_path: &Path, tenant_conf: TenantConfOpt, - first_save: bool, + creating_tenant: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving tenantconf").entered(); - info!("persisting tenantconf to {}", target_config_path.display()); - // TODO this will prepend comments endlessly ? - let mut conf_content = r#"# This file contains a specific per-tenant's config. -# It is read in case of pageserver restart. - -[tenant_config] -"# - .to_string(); - - // Convert the config to a toml file. 
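The tenant-state gauge wiring added above is driven by a tokio watch channel. A standalone sketch of that observer pattern, with a println standing in for the TENANT_STATE_METRIC gauge and all values made up:

    // Sketch only (assumes the tokio crate).
    use tokio::sync::watch;

    #[tokio::main]
    async fn main() {
        let (tx, mut rx) = watch::channel("Loading");
        let observer = tokio::spawn(async move {
            // Read the initial state, then react to every transition until the
            // sender side is dropped (changed() returns Err at that point).
            let mut current = *rx.borrow_and_update();
            while rx.changed().await.is_ok() {
                let new = *rx.borrow();
                println!("state transition: {current} -> {new}");
                current = new;
            }
        });
        tx.send("Active").unwrap();
        drop(tx);
        observer.await.unwrap();
    }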
- conf_content += &toml_edit::easy::to_string(&tenant_conf)?; - - let mut target_config_file = VirtualFile::open_with_options( - target_config_path, - OpenOptions::new() - .truncate(true) // This needed for overwriting with small config files - .write(true) - .create_new(first_save), - )?; - - target_config_file - .write(conf_content.as_bytes()) - .context("Failed to write toml bytes into file") - .and_then(|_| { - target_config_file - .sync_all() - .context("Faile to fsync config file") - }) - .with_context(|| { + // imitate a try-block with a closure + let do_persist = |target_config_path: &Path| -> anyhow::Result<()> { + let target_config_parent = target_config_path.parent().with_context(|| { format!( - "Failed to write config file into path '{}'", + "Config path does not have a parent: {}", target_config_path.display() ) })?; - // fsync the parent directory to ensure the directory entry is durable - if first_save { - target_config_path - .parent() - .context("Config file does not have a parent") - .and_then(|target_config_parent| { - File::open(target_config_parent).context("Failed to open config parent") - }) - .and_then(|tenant_dir| { - tenant_dir - .sync_all() - .context("Failed to fsync config parent") - }) - .with_context(|| { - format!( - "Failed to fsync on first save for config {}", - target_config_path.display() - ) - })?; - } + info!("persisting tenantconf to {}", target_config_path.display()); - Ok(()) + let mut conf_content = r#"# This file contains a specific per-tenant's config. +# It is read in case of pageserver restart. + +[tenant_config] +"# + .to_string(); + + // Convert the config to a toml file. + conf_content += &toml_edit::easy::to_string(&tenant_conf)?; + + let mut target_config_file = VirtualFile::open_with_options( + target_config_path, + OpenOptions::new() + .truncate(true) // This needed for overwriting with small config files + .write(true) + .create_new(creating_tenant) + // when creating a new tenant, first_save will be true and `.create(true)` will be + // ignored (per rust std docs). + // + // later when updating the config of created tenant, or persisting config for the + // first time for attached tenant, the `.create(true)` is used. + .create(true), + )?; + + target_config_file + .write(conf_content.as_bytes()) + .context("write toml bytes into file") + .and_then(|_| target_config_file.sync_all().context("fsync config file")) + .context("write config file")?; + + // fsync the parent directory to ensure the directory entry is durable. + // before this was done conditionally on creating_tenant, but these management actions are rare + // enough to just fsync it always. + + crashsafe::fsync(target_config_parent)?; + Ok(()) + }; + + // this function is called from creating the tenant and updating the tenant config, which + // would otherwise share this context, so keep it here in one place. 
+ do_persist(target_config_path).with_context(|| { + format!( + "write tenant {tenant_id} config to {}", + target_config_path.display() + ) + }) } // @@ -1871,12 +1911,13 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); let gc_timelines = self - .refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx) .await?; utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); @@ -1917,7 +1958,10 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. - pub async fn refresh_gc_info(&self) -> anyhow::Result>> { + pub async fn refresh_gc_info( + &self, + ctx: &RequestContext, + ) -> anyhow::Result>> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -1927,7 +1971,7 @@ impl Tenant { // refresh all timelines let target_timeline_id = None; - self.refresh_gc_info_internal(target_timeline_id, horizon, pitr) + self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx) .await } @@ -1936,6 +1980,7 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result>> { // grab mutex to prevent new timelines from being created here. let gc_cs = self.gc_cs.lock().await; @@ -2007,7 +2052,9 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline.update_gc_info(branchpoints, cutoff, pitr).await?; + timeline + .update_gc_info(branchpoints, cutoff, pitr, ctx) + .await?; gc_timelines.push(timeline); } @@ -2019,53 +2066,53 @@ impl Tenant { /// Branch an existing timeline async fn branch_timeline( &self, - src: TimelineId, - dst: TimelineId, + src_timeline: &Arc, + dst_id: TimelineId, start_lsn: Option, + _ctx: &RequestContext, ) -> anyhow::Result> { - // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn - // about timelines, so otherwise a race condition is possible, where we create new timeline and GC - // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().await; - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(dst, &timelines)? - }; - - // In order for the branch creation task to not wait for GC/compaction, - // we need to make sure that the starting LSN of the child branch is not out of scope midway by - // - // 1. holding the GC lock to prevent overwritting timeline's GC data - // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline - // - // Step 2 is to avoid initializing the new branch using data removed by past GC iterations - // or in-queue GC iterations. 
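The persist_tenant_config rework above relies on the usual write, fsync the file, then fsync the parent directory sequence so that both the file contents and its directory entry survive a crash. A stripped-down sketch of that sequence using only the standard library; the real code goes through VirtualFile and crashsafe::fsync, and the path and contents here are placeholders:

    // Sketch only; Unix-oriented (opening a directory for fsync).
    use std::fs::File;
    use std::io::Write;
    use std::path::Path;

    fn persist(path: &Path, contents: &[u8]) -> std::io::Result<()> {
        let mut f = File::create(path)?;
        f.write_all(contents)?;
        // Flush the file data and metadata to disk.
        f.sync_all()?;
        // Then fsync the parent directory so the new directory entry is durable.
        if let Some(parent) = path.parent() {
            File::open(parent)?.sync_all()?;
        }
        Ok(())
    }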
- - let src_timeline = self.get_timeline(src, false).with_context(|| { - format!( - "No ancestor {} found for timeline {}/{}", - src, self.tenant_id, dst - ) - })?; - - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + let src_id = src_timeline.timeline_id; // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN let start_lsn = start_lsn.unwrap_or_else(|| { let lsn = src_timeline.get_last_record_lsn(); - info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + info!("branching timeline {dst_id} from timeline {src_id} at last record LSN: {lsn}"); lsn }); - // Check if the starting LSN is out of scope because it is less than - // 1. the latest GC cutoff LSN or - // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. + // First acquire the GC lock so that another task cannot advance the GC + // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are + // creating the branch. + let _gc_cs = self.gc_cs.lock().await; + + // Create a placeholder for the new branch. This will error + // out if the new timeline ID is already in use. + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(dst_id, &timelines)? + }; + + // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR + // horizon on the source timeline + // + // We check it against both the planned GC cutoff stored in 'gc_info', + // and the 'latest_gc_cutoff' of the last GC that was performed. The + // planned GC cutoff in 'gc_info' is normally larger than + // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just + // changed the GC settings for the tenant to make the PITR window + // larger, but some of the data was already removed by an earlier GC + // iteration. + + // check against last actual 'latest_gc_cutoff' first + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", *latest_gc_cutoff_lsn, ))?; + + // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); @@ -2076,6 +2123,12 @@ impl Tenant { } } + // + // The branch point is valid, and we are still holding the 'gc_cs' lock + // so that GC cannot advance the GC cutoff until we are finished. + // Proceed with the branch creation. + // + // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. let RecordLsn { @@ -2094,7 +2147,7 @@ impl Tenant { let metadata = TimelineMetadata::new( start_lsn, dst_prev, - Some(src), + Some(src_id), start_lsn, *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? src_timeline.initdb_lsn, @@ -2103,15 +2156,15 @@ impl Tenant { let mut timelines = self.timelines.lock().unwrap(); let new_timeline = self .prepare_timeline( - dst, + dst_id, metadata, timeline_uninit_mark, false, - Some(src_timeline), + Some(Arc::clone(src_timeline)), )? 
.initialize_with_lock(&mut timelines, true, true)?; drop(timelines); - info!("branched timeline {dst} from {src} at {start_lsn}"); + info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); Ok(new_timeline) } @@ -2122,6 +2175,7 @@ impl Tenant { &self, timeline_id: TimelineId, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result> { let timeline_uninit_mark = { let timelines = self.timelines.lock().unwrap(); @@ -2181,6 +2235,7 @@ impl Tenant { unfinished_timeline, pgdata_path, pgdata_lsn, + ctx, ) .await .with_context(|| { @@ -2352,7 +2407,10 @@ impl Tenant { /// /// Future is cancellation safe. Only one calculation can be running at once per tenant. #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn gather_size_inputs(&self) -> anyhow::Result { + pub async fn gather_size_inputs( + &self, + ctx: &RequestContext, + ) -> anyhow::Result { let logical_sizes_at_once = self .conf .concurrent_tenant_size_logical_size_queries @@ -2364,15 +2422,15 @@ impl Tenant { // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await + size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await } /// Calculate synthetic tenant size /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn calculate_synthetic_size(&self) -> anyhow::Result { - let inputs = self.gather_size_inputs().await?; + pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result { + let inputs = self.gather_size_inputs(ctx).await?; let size = inputs.calculate()?; @@ -2475,26 +2533,19 @@ fn try_create_target_tenant_dir( target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?; + .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?; let temporary_tenant_config_path = rebase_directory( &conf.tenant_config_path(tenant_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?; + .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; + + Tenant::persist_tenant_config(&tenant_id, &temporary_tenant_config_path, tenant_conf, true)?; - Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context( - || { - format!( - "Failed to write tenant {} config to {}", - tenant_id, - temporary_tenant_config_path.display() - ) - }, - )?; crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( - "could not create tenant {} temporary timelines directory {}", + "create tenant {} temporary timelines directory {}", tenant_id, temporary_tenant_timelines_dir.display() ) @@ -2505,7 +2556,7 @@ fn try_create_target_tenant_dir( fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( - "failed to move tenant {} temporary directory {} into the permanent one {}", + "move tenant {} temporary directory {} into the permanent one {}", tenant_id, temporary_tenant_dir.display(), target_tenant_directory.display() @@ -2513,14 +2564,14 @@ fn try_create_target_tenant_dir( })?; let target_dir_parent = target_tenant_directory.parent().with_context(|| { format!( - "Failed to get tenant {} dir parent for {}", + "get tenant {} dir parent for {}", 
tenant_id, target_tenant_directory.display() ) })?; crashsafe::fsync(target_dir_parent).with_context(|| { format!( - "Failed to fsync renamed directory's parent {} for tenant {}", + "fsync renamed directory's parent {} for tenant {}", target_dir_parent.display(), tenant_id, ) @@ -2743,11 +2794,17 @@ pub mod harness { }) } - pub async fn load(&self) -> Arc { - self.try_load().await.expect("failed to load test tenant") + pub async fn load(&self) -> (Arc, RequestContext) { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + ( + self.try_load(&ctx) + .await + .expect("failed to load test tenant"), + ctx, + ) } - pub async fn try_load(&self) -> anyhow::Result> { + pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { let walredo_mgr = Arc::new(TestRedoManager); let tenant = Arc::new(Tenant::new( @@ -2775,8 +2832,7 @@ pub mod harness { timelines_to_load.insert(timeline_id, timeline_metadata); } // FIXME starts background jobs - tenant.load().await?; - + tenant.load(ctx).await?; Ok(tenant) } @@ -2833,10 +2889,9 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_basic")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -2849,15 +2904,15 @@ mod tests { drop(writer); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x10)).await?, + tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x1f)).await?, + tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x20)).await?, + tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, TEST_IMG("foo at 0x20") ); @@ -2866,14 +2921,14 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let tenant = TenantHarness::create("no_duplicate_timelines")? + let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? .load() .await; - let _ = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let _ = timeline.initialize(&ctx)?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -2899,13 +2954,13 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_branch")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; - let writer = tline.writer(); use std::str::from_utf8; + let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; + let writer = tline.writer(); + #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); #[allow(non_snake_case)] @@ -2925,7 +2980,7 @@ mod tests { // Branch the history, modify relation differently on the new timeline tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -2936,15 +2991,15 @@ mod tests { // Check page contents on both branches assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).await?)?, + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?, "foo at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).await?)?, + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?, "bar at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).await?)?, + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40), &ctx).await?)?, "foobar at 0x20" ); @@ -2996,13 +3051,12 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let tenant = + let (tenant, ctx) = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load() .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -3010,12 +3064,12 @@ mod tests { // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; // try to branch at lsn 25, should fail because we already garbage collected the data match tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3034,16 +3088,17 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? - .load() - .await; + let (tenant, ctx) = + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + .load() + .await; - tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? - .initialize()?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3085,40 +3140,40 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? - .load() - .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + .load() + .await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).await.is_ok()); + assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok()); Ok(()) } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = + TenantHarness::create("test_parent_keeps_data_forever_after_branching")? + .load() + .await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3128,12 +3183,12 @@ mod tests { // run gc on parent tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; // Check that the data is still accessible on the branch. assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50)).await?, + newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, TEST_IMG(&format!("foo at {}", Lsn(0x40))) ); @@ -3145,14 +3200,14 @@ mod tests { const TEST_NAME: &str = "timeline_load"; let harness = TenantHarness::create(TEST_NAME)?; { - let tenant = harness.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; } - let tenant = harness.load().await; + let (tenant, _ctx) = harness.load().await; tenant .get_timeline(TIMELINE_ID, true) .expect("cannot load timeline"); @@ -3166,15 +3221,15 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { - let tenant = harness.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant @@ -3185,7 +3240,7 @@ mod tests { } // check that both of them are initially unloaded - let tenant = harness.load().await; + let (tenant, _ctx) = harness.load().await; // check that both, child and ancestor are loaded let _child_tline = tenant @@ -3203,11 +3258,11 @@ mod tests { async fn corrupt_metadata() -> anyhow::Result<()> { const TEST_NAME: &str = "corrupt_metadata"; let harness = TenantHarness::create(TEST_NAME)?; - let tenant = harness.load().await; + let (tenant, ctx) = harness.load().await; tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? + .initialize(&ctx)?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -3219,7 +3274,7 @@ mod tests { metadata_bytes[8] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; - let err = harness.try_load().await.err().expect("should fail"); + let err = harness.try_load(&ctx).await.err().expect("should fail"); assert!(err .to_string() .starts_with("Failed to parse metadata bytes from path")); @@ -3243,10 +3298,9 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_images")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -3254,7 +3308,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; @@ -3262,7 +3316,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; @@ -3270,7 +3324,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; @@ -3278,26 +3332,26 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; assert_eq!( - tline.get(*TEST_KEY, Lsn(0x10)).await?, + tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x1f)).await?, + tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x20)).await?, + tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, TEST_IMG("foo at 0x20") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x30)).await?, + tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?, TEST_IMG("foo at 0x30") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x40)).await?, + tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?, TEST_IMG("foo at 0x40") ); @@ -3310,10 +3364,9 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_bulk_insert")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let mut lsn = Lsn(0x10); @@ -3342,10 +3395,10 @@ mod tests { let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3354,10 +3407,9 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_random_updates")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; const NUM_KEYS: usize = 1000; @@ -3407,7 +3459,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn).await?, + tline.get(test_key, lsn, &ctx).await?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3415,10 +3467,10 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3427,12 +3479,12 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_branches")? + let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? .load() .await; let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? + .initialize(&ctx)?; const NUM_KEYS: usize = 1000; @@ -3462,16 +3514,14 @@ mod tests { keyspace.add_key(test_key); } - let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); - tline_id = new_tline_id; for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -3493,7 +3543,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn).await?, + tline.get(test_key, lsn, &ctx).await?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3501,10 +3551,10 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3513,12 +3563,12 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? .load() .await; let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; @@ -3528,18 +3578,16 @@ mod tests { let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES]; let mut lsn = Lsn(0); - let mut tline_id = TIMELINE_ID; #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); - tline_id = new_tline_id; for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -3568,7 +3616,7 @@ mod tests { println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, *lsn).await?, + tline.get(test_key, *lsn, &ctx).await?, TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c95a98fbc7..e66ee0ae36 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -28,7 +28,12 @@ pub mod defaults { pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - pub const DEFAULT_GC_PERIOD: &str = "100 s"; + + // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. + // If there's a need to decrease this value, first make sure that GC + // doesn't hold a layer map write lock for non-trivial operations. + // Relevant: https://github.com/neondatabase/neon/issues/3394 + pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 01c5359e88..ed1a32c8fd 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -9,24 +9,57 @@ //! are frozen, and it is split up into new image and delta layers and the //! corresponding files are written to disk. //! +//! Design overview: +//! +//! The `search` method of the layer map is on the read critical path, so we've +//! built an efficient data structure for fast reads, stored in `LayerMap::historic`. +//! Other read methods are less critical but still impact performance of background tasks. +//! +//! This data structure relies on a persistent/immutable binary search tree. See the +//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +//! Summary: A persistent/immutable BST (and persistent data structures in general) allows +//! you to modify the tree in such a way that each modification creates a new "version" +//! of the tree. When you modify it, you get a new version, but all previous versions are +//! still accessible too. So if someone is still holding a reference to an older version, +//! they continue to see the tree as it was then. The persistent BST stores all the +//! different versions in an efficient way. +//! +//! Our persistent BST maintains a map of which layer file "covers" each key. It has only +//! one dimension, the key. See `layer_coverage.rs`. We use the persistent/immutable property +//! to handle the LSN dimension. +//! +//! To build the layer map, we insert each layer to the persistent BST in LSN.start order, +//! starting from the oldest one. After each insertion, we grab a reference to that "version" +//! 
of the tree, and store it in another tree, a BtreeMap keyed by the LSN. See +//! `historic_layer_coverage.rs`. +//! +//! To search for a particular key-LSN pair, you first look up the right "version" in the +//! BTreeMap. Then you search that version of the BST with the key. +//! +//! The persistent BST keeps all the versions, but there is no way to change the old versions +//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in +//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need +//! to throw away most of the persistent BST and build a new one, starting from the oldest +//! LSN. See `LayerMap::flush_updates()`. +//! +mod historic_layer_coverage; +mod layer_coverage; + +use crate::keyspace::KeyPartitioning; use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; -use crate::tenant::storage_layer::{range_eq, range_overlaps}; -use amplify_num::i256; +use crate::tenant::storage_layer::InMemoryLayer; +use crate::tenant::storage_layer::Layer; use anyhow::Result; -use num_traits::identities::{One, Zero}; -use num_traits::{Bounded, Num, Signed}; -use rstar::{RTree, RTreeObject, AABB}; -use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; -use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use std::sync::Arc; -use tracing::*; use utils::lsn::Lsn; -use super::storage_layer::{InMemoryLayer, Layer}; +use historic_layer_coverage::BufferedHistoricLayerCoverage; + +use super::storage_layer::range_eq; /// /// LayerMap tracks what layers exist on a timeline. @@ -51,8 +84,8 @@ pub struct LayerMap { /// pub frozen_layers: VecDeque>, - /// All the historic layers are kept here - historic_layers: RTree>, + /// Index of the historic layers optimized for search + historic: BufferedHistoricLayerCoverage>, /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. @@ -65,177 +98,64 @@ impl Default for LayerMap { open_layer: None, next_open_layer_at: None, frozen_layers: VecDeque::default(), - historic_layers: RTree::default(), l0_delta_layers: Vec::default(), + historic: BufferedHistoricLayerCoverage::default(), } } } -struct LayerRTreeObject { - layer: Arc, - - envelope: AABB<[IntKey; 2]>, +/// The primary update API for the layer map. +/// +/// Batching historic layer insertions and removals is good for +/// performance and this struct helps us do that correctly. +#[must_use] +pub struct BatchedUpdates<'a, L: ?Sized + Layer> { + // While we hold this exclusive reference to the layer map the type checker + // will prevent us from accidentally reading any unflushed updates. + layer_map: &'a mut LayerMap, } -// Representation of Key as numeric type. -// We can not use native implementation of i128, because rstar::RTree -// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi). -// Overflow will cause panic in debug mode and incorrect area calculation in release mode, -// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work). -// By using i256 as the type, even though all the actual values would fit in i128, we can be -// sure that multiplication doesn't overflow. 
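The design-overview comment above describes keeping one immutable "version" of the key coverage per layer insertion and indexing those versions by LSN. The idea can be illustrated with a deliberately naive stand-in that clones a BTreeMap per version instead of sharing structure; all keys, LSNs, and layer names below are made up:

    use std::collections::BTreeMap;
    use std::sync::Arc;

    fn main() {
        // versions[lsn] = coverage as of that LSN; older versions stay readable.
        let mut versions: BTreeMap<u64, Arc<BTreeMap<i128, &str>>> = BTreeMap::new();
        let mut coverage: BTreeMap<i128, &str> = BTreeMap::new();

        for (lsn, key, layer) in [(10u64, 0i128, "delta-A"), (20, 0, "image-B")] {
            coverage.insert(key, layer);
            // The real structure shares nodes between versions; cloning here is
            // only to show that each version is frozen once stored.
            versions.insert(lsn, Arc::new(coverage.clone()));
        }

        // Query at LSN 15: take the newest version at or below 15, then look up the key.
        let visible = versions.range(..=15u64).next_back().map(|(_, v)| Arc::clone(v));
        assert_eq!(visible.unwrap().get(&0), Some(&"delta-A"));
    }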
-// - -#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] -struct IntKey(i256); - -impl Copy for IntKey {} - -impl IntKey { - fn from(i: i128) -> Self { - IntKey(i256::from(i)) - } -} - -impl Bounded for IntKey { - fn min_value() -> Self { - IntKey(i256::MIN) - } - fn max_value() -> Self { - IntKey(i256::MAX) - } -} - -impl Signed for IntKey { - fn is_positive(&self) -> bool { - self.0 > i256::ZERO - } - fn is_negative(&self) -> bool { - self.0 < i256::ZERO - } - fn signum(&self) -> Self { - match self.0.cmp(&i256::ZERO) { - Ordering::Greater => IntKey(i256::ONE), - Ordering::Less => IntKey(-i256::ONE), - Ordering::Equal => IntKey(i256::ZERO), - } - } - fn abs(&self) -> Self { - IntKey(self.0.abs()) - } - fn abs_sub(&self, other: &Self) -> Self { - if self.0 <= other.0 { - IntKey(i256::ZERO) - } else { - IntKey(self.0 - other.0) - } - } -} - -impl Neg for IntKey { - type Output = Self; - fn neg(self) -> Self::Output { - IntKey(-self.0) - } -} - -impl Rem for IntKey { - type Output = Self; - fn rem(self, rhs: Self) -> Self::Output { - IntKey(self.0 % rhs.0) - } -} - -impl Div for IntKey { - type Output = Self; - fn div(self, rhs: Self) -> Self::Output { - IntKey(self.0 / rhs.0) - } -} - -impl Add for IntKey { - type Output = Self; - fn add(self, rhs: Self) -> Self::Output { - IntKey(self.0 + rhs.0) - } -} - -impl Sub for IntKey { - type Output = Self; - fn sub(self, rhs: Self) -> Self::Output { - IntKey(self.0 - rhs.0) - } -} - -impl Mul for IntKey { - type Output = Self; - fn mul(self, rhs: Self) -> Self::Output { - IntKey(self.0 * rhs.0) - } -} - -impl One for IntKey { - fn one() -> Self { - IntKey(i256::ONE) - } -} - -impl Zero for IntKey { - fn zero() -> Self { - IntKey(i256::ZERO) - } - fn is_zero(&self) -> bool { - self.0 == i256::ZERO - } -} - -impl Num for IntKey { - type FromStrRadixErr = ::FromStrRadixErr; - fn from_str_radix(str: &str, radix: u32) -> Result { - Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) - } -} - -impl PartialEq for LayerRTreeObject { - fn eq(&self, other: &Self) -> bool { - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - Arc::ptr_eq(&self.layer, &other.layer) - } -} - -impl RTreeObject for LayerRTreeObject -where - L: ?Sized, -{ - type Envelope = AABB<[IntKey; 2]>; - fn envelope(&self) -> Self::Envelope { - self.envelope - } -} - -impl LayerRTreeObject +/// Provide ability to batch more updates while hiding the read +/// API so we don't accidentally read without flushing. +impl BatchedUpdates<'_, L> where L: ?Sized + Layer, { - fn new(layer: Arc) -> Self { - let key_range = layer.get_key_range(); - let lsn_range = layer.get_lsn_range(); + /// + /// Insert an on-disk layer. + /// + pub fn insert_historic(&mut self, layer: Arc) { + self.layer_map.insert_historic_noflush(layer) + } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive - ); - LayerRTreeObject { layer, envelope } + /// + /// Remove an on-disk layer from the map. + /// + /// This should be called when the corresponding file on disk has been deleted. 
+ /// + pub fn remove_historic(&mut self, layer: Arc) { + self.layer_map.remove_historic_noflush(layer) + } + + // We will flush on drop anyway, but this method makes it + // more explicit that there is some work being done. + /// Apply all updates + pub fn flush(self) { + // Flush happens on drop + } +} + +// Ideally the flush() method should be called explicitly for more +// controlled execution. But if we forget we'd rather flush on drop +// than panic later or read without flushing. +// +// TODO maybe warn if flush hasn't explicitly been called +impl Drop for BatchedUpdates<'_, L> +where + L: ?Sized + Layer, +{ + fn drop(&mut self) { + self.layer_map.flush_updates(); } } @@ -281,125 +201,91 @@ where /// 'open' and 'frozen' layers! /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { - // Find the latest image layer that covers the given key - let mut latest_img: Option> = None; - let mut latest_img_lsn: Option = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0i128)], - [ - IntKey::from(key.to_i128()), - IntKey::from(end_lsn.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - assert!(l.get_key_range().contains(&key)); - let img_lsn = l.get_lsn_range().start; - assert!(img_lsn < end_lsn); - if Lsn(img_lsn.0 + 1) == end_lsn { - // found exact match - return Some(SearchResult { - layer: Arc::clone(l), - lsn_floor: img_lsn, - }); - } - if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { - latest_img = Some(Arc::clone(l)); - latest_img_lsn = Some(img_lsn); - } - } + let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let latest_delta = version.delta_coverage.query(key.to_i128()); + let latest_image = version.image_coverage.query(key.to_i128()); - // Search the delta layers - let mut latest_delta: Option> = None; - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; + match (latest_delta, latest_image) { + (None, None) => None, + (None, Some(image)) => { + let lsn_floor = image.get_lsn_range().start; + Some(SearchResult { + layer: image, + lsn_floor, + }) } - assert!(l.get_key_range().contains(&key)); - if l.get_lsn_range().start >= end_lsn { - info!( - "Candidate delta layer {}..{} is too new for lsn {}", - l.get_lsn_range().start, - l.get_lsn_range().end, - end_lsn - ); + (Some(delta), None) => { + let lsn_floor = delta.get_lsn_range().start; + Some(SearchResult { + layer: delta, + lsn_floor, + }) } - assert!(l.get_lsn_range().start < end_lsn); - if l.get_lsn_range().end >= end_lsn { - // this layer contains the requested point in the key/lsn space. - // No need to search any further - trace!( - "found layer {} for request on {key} at {end_lsn}", - l.short_id(), - ); - latest_delta.replace(Arc::clone(l)); - break; - } - if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) { - // this layer's end LSN is smaller than the requested point. If there's - // nothing newer, this is what we need to return. Remember this. 
- if let Some(old_candidate) = &latest_delta { - if l.get_lsn_range().end > old_candidate.get_lsn_range().end { - latest_delta.replace(Arc::clone(l)); - } + (Some(delta), Some(image)) => { + let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { + Some(SearchResult { + layer: image, + lsn_floor: img_lsn, + }) } else { - latest_delta.replace(Arc::clone(l)); + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + Some(SearchResult { + layer: delta, + lsn_floor, + }) } } } - if let Some(l) = latest_delta { - trace!( - "found (old) layer {} for request on {key} at {end_lsn}", - l.short_id(), - ); - let lsn_floor = std::cmp::max( - Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), - l.get_lsn_range().start, - ); - Some(SearchResult { - lsn_floor, - layer: l, - }) - } else if let Some(l) = latest_img { - trace!("found img layer and no deltas for request on {key} at {end_lsn}"); - Some(SearchResult { - lsn_floor: latest_img_lsn.unwrap(), - layer: l, - }) - } else { - trace!("no layer found for request on {key} at {end_lsn}"); - None - } + } + + /// Start a batch of updates, applied on drop + pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> { + BatchedUpdates { layer_map: self } } /// /// Insert an on-disk layer /// - pub fn insert_historic(&mut self, layer: Arc) { - if layer.get_key_range() == (Key::MIN..Key::MAX) { - self.l0_delta_layers.push(layer.clone()); + /// Helper function for BatchedUpdates::insert_historic + /// + pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + self.historic.insert( + historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }, + Arc::clone(&layer), + ); + + if Self::is_l0(&layer) { + self.l0_delta_layers.push(layer); } - self.historic_layers.insert(LayerRTreeObject::new(layer)); + NUM_ONDISK_LAYERS.inc(); } /// /// Remove an on-disk layer from the map. /// - /// This should be called when the corresponding file on disk has been deleted. + /// Helper function for BatchedUpdates::remove_historic /// - pub fn remove_historic(&mut self, layer: Arc) { - if layer.get_key_range() == (Key::MIN..Key::MAX) { + pub fn remove_historic_noflush(&mut self, layer: Arc) { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + self.historic.remove(historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }); + + if Self::is_l0(&layer) { let len_before = self.l0_delta_layers.len(); // FIXME: ptr_eq might fail to return true for 'dyn' @@ -411,98 +297,57 @@ where .retain(|other| !Arc::ptr_eq(other, &layer)); assert_eq!(self.l0_delta_layers.len(), len_before - 1); } - assert!(self - .historic_layers - .remove(&LayerRTreeObject::new(layer)) - .is_some()); + NUM_ONDISK_LAYERS.dec(); } + /// Helper function for BatchedUpdates::drop. + pub(self) fn flush_updates(&mut self) { + self.historic.rebuild(); + } + /// Is there a newer image layer for given key- and LSN-range? Or a set /// of image layers within the specified lsn range that cover the entire /// specified key range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. 
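Editor's note, stepping back to the search() match above: the (Some(delta), Some(image)) arm is pure LSN-range arithmetic, so it can be checked in isolation. A stand-alone sketch of that decision, using bare u64 ranges instead of layer objects (Pick, pick and the sample ranges are made up for illustration):

use std::ops::Range;

#[derive(Debug, PartialEq)]
enum Pick {
    Image { lsn_floor: u64 },
    Delta { lsn_floor: u64 },
}

/// Prefer the image if it is at least as new as the delta or is an exact match
/// for the request; otherwise return the delta, floored just above the image.
fn pick(delta: Range<u64>, image: Range<u64>, end_lsn: u64) -> Pick {
    let img_lsn = image.start;
    let image_is_newer = image.end >= delta.end;
    let image_exact_match = img_lsn + 1 == end_lsn;
    if image_is_newer || image_exact_match {
        Pick::Image { lsn_floor: img_lsn }
    } else {
        Pick::Delta {
            lsn_floor: std::cmp::max(delta.start, image.start + 1),
        }
    }
}

fn main() {
    // Delta 90..300 overlaps an image at 100: read the delta only down to 101.
    assert_eq!(pick(90..300, 100..101, 250), Pick::Delta { lsn_floor: 101 });
    // The image is at least as new as the delta's end: the image alone answers the read.
    assert_eq!(pick(150..200, 240..241, 250), Pick::Image { lsn_floor: 240 });
}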
- pub fn image_layer_exists( - &self, - key_range: &Range, - lsn_range: &Range, - ) -> Result { - let mut range_remain = key_range.clone(); + pub fn image_layer_exists(&self, key: &Range, lsn: &Range) -> Result { + if key.is_empty() { + // Vacuously true. There's a newer image for all 0 of the kerys in the range. + return Ok(true); + } - loop { - let mut made_progress = false; - let envelope = AABB::from_corners( - [ - IntKey::from(range_remain.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(range_remain.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - let img_lsn = l.get_lsn_range().start; - if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { - made_progress = true; - let img_key_end = l.get_key_range().end; + let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { + Some(v) => v, + None => return Ok(false), + }; - if img_key_end >= range_remain.end { - return Ok(true); - } - range_remain.start = img_key_end; - } - } + let start = key.start.to_i128(); + let end = key.end.to_i128(); - if !made_progress { + let layer_covers = |layer: Option>| match layer { + Some(layer) => layer.get_lsn_range().start >= lsn.start, + None => false, + }; + + // Check the start is covered + if !layer_covers(version.image_coverage.query(start)) { + return Ok(false); + } + + // Check after all changes of coverage + for (_, change_val) in version.image_coverage.range(start..end) { + if !layer_covers(change_val) { return Ok(false); } } + + Ok(true) } pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { - self.historic_layers.iter().map(|e| e.layer.clone()) - } - - /// Find the last image layer that covers 'key', ignoring any image layers - /// newer than 'lsn'. 
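Editor's note: the rewritten image_layer_exists() above reduces "is this whole key range covered by new-enough images?" to one point query at the start of the range plus a scan over the coverage change points inside it. A simplified, self-contained version of that check against a plain BTreeMap coverage (hypothetical types; the real query goes through LayerCoverage):

use std::collections::BTreeMap;
use std::ops::Range;

/// Coverage change points: key -> LSN at which the covering image starts,
/// or None where there is a hole. Stands in for version.image_coverage.
fn covered(coverage: &BTreeMap<i128, Option<u64>>, key: Range<i128>, min_lsn: u64) -> bool {
    if key.is_empty() {
        return true; // vacuously covered, as in the patch
    }
    let covers = |v: Option<&Option<u64>>| -> bool {
        matches!(v.and_then(|x| x.as_ref()), Some(lsn) if *lsn >= min_lsn)
    };
    // The image in effect at key.start ...
    if !covers(coverage.range(..=key.start).next_back().map(|(_, v)| v)) {
        return false;
    }
    // ... and every change of coverage inside the range must be new enough.
    coverage.range(key.start + 1..key.end).all(|(_, v)| covers(Some(v)))
}

fn main() {
    let mut cov = BTreeMap::new();
    cov.insert(0i128, Some(100u64)); // image written at LSN 100 covers keys 0..10
    cov.insert(10, Some(90)); // an older image covers keys 10..20
    cov.insert(20, None); // nothing covers keys from 20 on
    assert!(covered(&cov, 0..10, 95));
    assert!(!covered(&cov, 0..15, 95)); // keys 10..15 only have the LSN-90 image
    assert!(!covered(&cov, 0..25, 80)); // the hole at key 20 breaks the cover
}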
- fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { - let mut candidate_lsn = Lsn(0); - let mut candidate = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0)], - [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - - assert!(l.get_key_range().contains(&key)); - let this_lsn = l.get_lsn_range().start; - assert!(this_lsn <= lsn); - if this_lsn < candidate_lsn { - // our previous candidate was better - continue; - } - candidate_lsn = this_lsn; - candidate = Some(Arc::clone(l)); - } - - candidate + self.historic.iter() } /// @@ -518,94 +363,288 @@ where key_range: &Range, lsn: Lsn, ) -> Result, Option>)>> { - let mut points = vec![key_range.start]; - let envelope = AABB::from_corners( - [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], - [ - IntKey::from(key_range.end.to_i128()), - IntKey::from(lsn.0 as i128), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - assert!(l.get_lsn_range().start <= lsn); - let range = l.get_key_range(); - if key_range.contains(&range.start) { - points.push(l.get_key_range().start); - } - if key_range.contains(&range.end) { - points.push(l.get_key_range().end); - } + let version = match self.historic.get().unwrap().get_version(lsn.0) { + Some(v) => v, + None => return Ok(vec![]), + }; + + let start = key_range.start.to_i128(); + let end = key_range.end.to_i128(); + + // Initialize loop variables + let mut coverage: Vec<(Range, Option>)> = vec![]; + let mut current_key = start; + let mut current_val = version.image_coverage.query(start); + + // Loop through the change events and push intervals + for (change_key, change_val) in version.image_coverage.range(start..end) { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); + coverage.push((kr, current_val.take())); + current_key = change_key; + current_val = change_val.clone(); } - points.push(key_range.end); - points.sort(); - points.dedup(); + // Add the final interval + let kr = Key::from_i128(current_key)..Key::from_i128(end); + coverage.push((kr, current_val.take())); - // Ok, we now have a list of "interesting" points in the key space - - // For each range between the points, find the latest image - let mut start = *points.first().unwrap(); - let mut ranges = Vec::new(); - for end in points[1..].iter() { - let img = self.find_latest_image(start, lsn); - - ranges.push((start..*end, img)); - - start = *end; - } - Ok(ranges) + Ok(coverage) } - /// Count the height of the tallest stack of deltas in this 2d region. + pub fn is_l0(layer: &L) -> bool { + range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX)) + } + + /// This function determines which layers are counted in `count_deltas`: + /// layers that should count towards deciding whether or not to reimage + /// a certain partition range. + /// + /// There are two kinds of layers we currently consider reimage-worthy: + /// + /// Case 1: Non-L0 layers are currently reimage-worthy by default. + /// TODO Some of these layers are very sparse and cover the entire key + /// range. Replacing 256MB of data (or less!) with terabytes of + /// images doesn't seem wise. 
We need a better heuristic, possibly + /// based on some of these factors: + /// a) whether this layer has any wal in this partition range + /// b) the size of the layer + /// c) the number of images needed to cover it + /// d) the estimated time until we'll have to reimage over it for GC + /// + /// Case 2: Since L0 layers by definition cover the entire key space, we consider + /// them reimage-worthy only when the entire key space can be covered by very few + /// images (currently 1). + /// TODO The optimal number should probably be slightly higher than 1, but to + /// implement that we need to plumb a lot more context into this function + /// than just the current partition_range. + pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> bool { + // Case 1 + if !Self::is_l0(layer) { + return true; + } + + // Case 2 + if range_eq(partition_range, &(Key::MIN..Key::MAX)) { + return true; + } + + false + } + + /// Count the height of the tallest stack of reimage-worthy deltas + /// in this 2d region. + /// + /// If `limit` is provided we don't try to count above that number. /// /// This number is used to compute the largest number of deltas that /// we'll need to visit for any page reconstruction in this region. /// We use this heuristic to decide whether to create an image layer. - /// - /// TODO currently we just return the total number of deltas in the - /// region, no matter if they're stacked on top of each other - /// or next to each other. - pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { - let mut result = 0; - if lsn_range.start >= lsn_range.end { + pub fn count_deltas( + &self, + key: &Range, + lsn: &Range, + limit: Option, + ) -> Result { + // We get the delta coverage of the region, and for each part of the coverage + // we recurse right underneath the delta. The recursion depth is limited by + // the largest result this function could return, which is in practice between + // 3 and 10 (since we usually try to create an image when the number gets larger). + + if lsn.is_empty() || key.is_empty() || limit == Some(0) { return Ok(0); } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; - } - assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); - assert!(range_overlaps(&l.get_key_range(), key_range)); - // We ignore level0 delta layers. 
Unless the whole keyspace fits - // into one partition - if !range_eq(key_range, &(Key::MIN..Key::MAX)) - && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) - { - continue; + let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { + Some(v) => v, + None => return Ok(0), + }; + + let start = key.start.to_i128(); + let end = key.end.to_i128(); + + // Initialize loop variables + let mut max_stacked_deltas = 0; + let mut current_key = start; + let mut current_val = version.delta_coverage.query(start); + + // Loop through the delta coverage and recurse on each part + for (change_key, change_val) in version.delta_coverage.range(start..end) { + // If there's a relevant delta in this part, add 1 and recurse down + if let Some(val) = current_val { + if val.get_lsn_range().end > lsn.start { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); + let lr = lsn.start..val.get_lsn_range().start; + if !kr.is_empty() { + let base_count = Self::is_reimage_worthy(&val, key) as usize; + let new_limit = limit.map(|l| l - base_count); + let max_stacked_deltas_underneath = + self.count_deltas(&kr, &lr, new_limit)?; + max_stacked_deltas = std::cmp::max( + max_stacked_deltas, + base_count + max_stacked_deltas_underneath, + ); + } + } } - result += 1; + current_key = change_key; + current_val = change_val.clone(); } - Ok(result) + + // Consider the last part + if let Some(val) = current_val { + if val.get_lsn_range().end > lsn.start { + let kr = Key::from_i128(current_key)..Key::from_i128(end); + let lr = lsn.start..val.get_lsn_range().start; + + if !kr.is_empty() { + let base_count = Self::is_reimage_worthy(&val, key) as usize; + let new_limit = limit.map(|l| l - base_count); + let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; + max_stacked_deltas = std::cmp::max( + max_stacked_deltas, + base_count + max_stacked_deltas_underneath, + ); + } + } + } + + Ok(max_stacked_deltas) + } + + /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. + /// + /// The `partition_range` argument is used as context for the reimage-worthiness decision. + /// + /// Used as a helper for correctness checks only. Performance not critical. + pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range) -> usize { + match self.search(key, lsn) { + Some(search_result) => { + if search_result.layer.is_incremental() { + (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize) + + self.get_difficulty(search_result.lsn_floor, key, partition_range) + } else { + 0 + } + } + None => 0, + } + } + + /// Used for correctness checking. Results are expected to be identical to + /// self.get_difficulty_map. Assumes self.search is correct. + pub fn get_difficulty_map_bruteforce( + &self, + lsn: Lsn, + partitioning: &KeyPartitioning, + ) -> Vec { + // Looking at the difficulty as a function of key, it could only increase + // when a delta layer starts or an image layer ends. 
Therefore it's sufficient + // to check the difficulties at: + // - the key.start for each non-empty part range + // - the key.start for each delta + // - the key.end for each image + let keys_iter: Box> = { + let mut keys: Vec = self + .iter_historic_layers() + .map(|layer| { + if layer.is_incremental() { + layer.get_key_range().start + } else { + layer.get_key_range().end + } + }) + .collect(); + keys.sort(); + Box::new(keys.into_iter()) + }; + let mut keys_iter = keys_iter.peekable(); + + // Iter the partition and keys together and query all the necessary + // keys, computing the max difficulty for each part. + partitioning + .parts + .iter() + .map(|part| { + let mut difficulty = 0; + // Partition ranges are assumed to be sorted and disjoint + // TODO assert it + for range in &part.ranges { + if !range.is_empty() { + difficulty = + std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range)); + } + while let Some(key) = keys_iter.peek() { + if key >= &range.end { + break; + } + let key = keys_iter.next().unwrap(); + if key < range.start { + continue; + } + difficulty = + std::cmp::max(difficulty, self.get_difficulty(lsn, key, range)); + } + } + difficulty + }) + .collect() + } + + /// For each part of a keyspace partitioning, return the maximum number of layers + /// that would be needed for page reconstruction in that part at the given LSN. + /// + /// If `limit` is provided we don't try to count above that number. + /// + /// This method is used to decide where to create new image layers. Computing the + /// result for the entire partitioning at once allows this function to be more + /// efficient, and further optimization is possible by using iterators instead, + /// to allow early return. + /// + /// TODO actually use this method instead of count_deltas. Currently we only use + /// it for benchmarks. + pub fn get_difficulty_map( + &self, + lsn: Lsn, + partitioning: &KeyPartitioning, + limit: Option, + ) -> Vec { + // TODO This is a naive implementation. Perf improvements to do: + // 1. Instead of calling self.image_coverage and self.count_deltas, + // iterate the image and delta coverage only once. + partitioning + .parts + .iter() + .map(|part| { + let mut difficulty = 0; + for range in &part.ranges { + if limit == Some(difficulty) { + break; + } + for (img_range, last_img) in self + .image_coverage(range, lsn) + .expect("why would this err?") + { + if limit == Some(difficulty) { + break; + } + let img_lsn = if let Some(last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + + if img_lsn < lsn { + let num_deltas = self + .count_deltas(&img_range, &(img_lsn..lsn), limit) + .expect("why would this err lol?"); + difficulty = std::cmp::max(difficulty, num_deltas); + } + } + } + difficulty + }) + .collect() } /// Return all L0 delta layers @@ -629,8 +668,8 @@ where } println!("historic_layers:"); - for e in self.historic_layers.iter() { - e.layer.dump(verbose)?; + for layer in self.iter_historic_layers() { + layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs new file mode 100644 index 0000000000..46821aef15 --- /dev/null +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -0,0 +1,583 @@ +use std::collections::BTreeMap; +use std::ops::Range; + +use tracing::info; + +use super::layer_coverage::LayerCoverageTuple; + +/// Layers in this module are identified and indexed by this data. 
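Editor's note, for intuition about get_difficulty() above: at a single key it is just "walk down from the newest layer, counting deltas until an image answers the read". The sketch below models that walk over a hand-written stack of layers at one key, ignoring the is_reimage_worthy() filter and the partition context (ToyLayer and the sample LSNs are invented for the example):

struct ToyLayer {
    lsn: std::ops::Range<u64>,
    is_image: bool,
}

/// Count how many deltas a read at `lsn` must visit before an image (or the bottom).
fn difficulty(stack: &[ToyLayer], mut lsn: u64) -> usize {
    let mut n = 0;
    loop {
        // "search": the newest layer whose LSN start is below the current read point.
        let Some(top) = stack
            .iter()
            .filter(|l| l.lsn.start < lsn)
            .max_by_key(|l| l.lsn.start)
        else {
            return n;
        };
        if top.is_image {
            return n;
        }
        n += 1;
        lsn = top.lsn.start; // keep reading below this delta, like recursing at lsn_floor
    }
}

fn main() {
    let stack = vec![
        ToyLayer { lsn: 100..101, is_image: true },
        ToyLayer { lsn: 110..120, is_image: false },
        ToyLayer { lsn: 130..140, is_image: false },
    ];
    assert_eq!(difficulty(&stack, 150), 2); // two deltas sit on top of the image
    assert_eq!(difficulty(&stack, 105), 0); // the image alone answers the read
}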
+/// +/// This is a helper struct to enable sorting layers by lsn.start. +/// +/// These three values are enough to uniquely identify a layer, since +/// a layer is obligated to contain all contents within range, so two +/// deltas (or images) with the same range have identical content. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct LayerKey { + // TODO I use i128 and u64 because it was easy for prototyping, + // testing, and benchmarking. If we can use the Lsn and Key + // types without overhead that would be preferable. + pub key: Range, + pub lsn: Range, + pub is_image: bool, +} + +impl PartialOrd for LayerKey { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for LayerKey { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // NOTE we really care about comparing by lsn.start first + self.lsn + .start + .cmp(&other.lsn.start) + .then(self.lsn.end.cmp(&other.lsn.end)) + .then(self.key.start.cmp(&other.key.start)) + .then(self.key.end.cmp(&other.key.end)) + .then(self.is_image.cmp(&other.is_image)) + } +} + +/// Efficiently queryable layer coverage for each LSN. +/// +/// Allows answering layer map queries very efficiently, +/// but doesn't allow retroactive insertion, which is +/// sometimes necessary. See BufferedHistoricLayerCoverage. +pub struct HistoricLayerCoverage { + /// The latest state + head: LayerCoverageTuple, + + /// All previous states + historic: BTreeMap>, +} + +impl Default for HistoricLayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl HistoricLayerCoverage { + pub fn new() -> Self { + Self { + head: LayerCoverageTuple::default(), + historic: BTreeMap::default(), + } + } + + /// Add a layer + /// + /// Panics if new layer has older lsn.start than an existing layer. + /// See BufferedHistoricLayerCoverage for a more general insertion method. + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { + // It's only a persistent map, not a retroactive one + if let Some(last_entry) = self.historic.iter().next_back() { + let last_lsn = last_entry.0; + if layer_key.lsn.start < *last_lsn { + panic!("unexpected retroactive insert"); + } + } + + // Insert into data structure + if layer_key.is_image { + self.head + .image_coverage + .insert(layer_key.key, layer_key.lsn.clone(), value); + } else { + self.head + .delta_coverage + .insert(layer_key.key, layer_key.lsn.clone(), value); + } + + // Remember history. Clone is O(1) + self.historic.insert(layer_key.lsn.start, self.head.clone()); + } + + /// Query at a particular LSN, inclusive + pub fn get_version(&self, lsn: u64) -> Option<&LayerCoverageTuple> { + match self.historic.range(..=lsn).next_back() { + Some((_, v)) => Some(v), + None => None, + } + } + + /// Remove all entries after a certain LSN (inclusive) + pub fn trim(&mut self, begin: &u64) { + self.historic.split_off(begin); + self.head = self + .historic + .iter() + .rev() + .next() + .map(|(_, v)| v.clone()) + .unwrap_or_default(); + } +} + +/// This is the most basic test that demonstrates intended usage. +/// All layers in this test have height 1. 
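Editor's note: HistoricLayerCoverage::trim() above relies on BTreeMap::split_off, which keeps everything strictly before the cut point and hands back the rest, after which head is reset to the newest surviving snapshot. A quick stand-alone demonstration of that std behaviour, with string placeholders for the per-LSN snapshots:

use std::collections::BTreeMap;

fn main() {
    let mut versions: BTreeMap<u64, &str> = BTreeMap::new();
    versions.insert(100, "coverage as of lsn 100");
    versions.insert(110, "coverage as of lsn 110");
    versions.insert(120, "coverage as of lsn 120");

    // trim(&110): drop every version at or after LSN 110, keep the older ones.
    let dropped = versions.split_off(&110);
    assert_eq!(dropped.len(), 2);
    assert!(versions.contains_key(&100) && !versions.contains_key(&110));

    // The new head is the newest remaining version, or a default if none remains.
    let head = versions.iter().next_back().map(|(_, v)| *v).unwrap_or("empty");
    assert_eq!(head, "coverage as of lsn 100");
}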
+#[test] +fn test_persistent_simple() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 0..5, + lsn: 100..101, + is_image: true, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 3..9, + lsn: 110..111, + is_image: true, + }, + "Layer 2".to_string(), + ); + map.insert( + LayerKey { + key: 5..6, + lsn: 120..121, + is_image: true, + }, + "Layer 3".to_string(), + ); + + // After Layer 1 insertion + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + + // After Layer 2 insertion + let version = map.get_version(115).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(11), None); + + // After Layer 3 insertion + let version = map.get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 2".to_string())); +} + +/// Cover simple off-by-one edge cases +#[test] +fn test_off_by_one() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 3..5, + lsn: 100..110, + is_image: true, + }, + "Layer 1".to_string(), + ); + + // Check different LSNs + let version = map.get_version(99); + assert!(version.is_none()); + let version = map.get_version(100).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + let version = map.get_version(110).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + + // Check different keys + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!(version.image_coverage.query(3), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(5), None); +} + +/// Cover edge cases where layers begin or end on the same key +#[test] +fn test_key_collision() { + let mut map = HistoricLayerCoverage::::new(); + + map.insert( + LayerKey { + key: 3..5, + lsn: 100..110, + is_image: true, + }, + "Layer 10".to_string(), + ); + map.insert( + LayerKey { + key: 5..8, + lsn: 100..110, + is_image: true, + }, + "Layer 11".to_string(), + ); + map.insert( + LayerKey { + key: 3..4, + lsn: 200..210, + is_image: true, + }, + "Layer 20".to_string(), + ); + + // Check after layer 11 + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!( + version.image_coverage.query(3), + Some("Layer 10".to_string()) + ); + assert_eq!( + version.image_coverage.query(5), + Some("Layer 11".to_string()) + ); + assert_eq!( + version.image_coverage.query(7), + Some("Layer 11".to_string()) + ); + assert_eq!(version.image_coverage.query(8), None); + + // Check after layer 20 + let version = map.get_version(205).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!( + version.image_coverage.query(3), + Some("Layer 20".to_string()) + ); + assert_eq!( + version.image_coverage.query(5), + Some("Layer 11".to_string()) + ); + assert_eq!( + version.image_coverage.query(7), + Some("Layer 11".to_string()) + ); + assert_eq!(version.image_coverage.query(8), None); +} + +/// Test when rectangles have nontrivial 
height and possibly overlap +#[test] +fn test_persistent_overlapping() { + let mut map = HistoricLayerCoverage::::new(); + + // Add 3 key-disjoint layers with varying LSN ranges + map.insert( + LayerKey { + key: 1..2, + lsn: 100..200, + is_image: true, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 4..5, + lsn: 110..200, + is_image: true, + }, + "Layer 2".to_string(), + ); + map.insert( + LayerKey { + key: 7..8, + lsn: 120..300, + is_image: true, + }, + "Layer 3".to_string(), + ); + + // Add wide and short layer + map.insert( + LayerKey { + key: 0..9, + lsn: 130..199, + is_image: true, + }, + "Layer 4".to_string(), + ); + + // Add wide layer taller than some + map.insert( + LayerKey { + key: 0..9, + lsn: 140..201, + is_image: true, + }, + "Layer 5".to_string(), + ); + + // Add wide layer taller than all + map.insert( + LayerKey { + key: 0..9, + lsn: 150..301, + is_image: true, + }, + "Layer 6".to_string(), + ); + + // After layer 4 insertion + let version = map.get_version(135).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 4".to_string())); + + // After layer 5 insertion + let version = map.get_version(145).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 5".to_string())); + + // After layer 6 insertion + let version = map.get_version(155).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 6".to_string())); +} + +/// Wrapper for HistoricLayerCoverage that allows us to hack around the lack +/// of support for retroactive insertion by rebuilding the map since the +/// change. +/// +/// Why is this needed? We most often insert new layers with newer LSNs, +/// but during compaction we create layers with non-latest LSN, and during +/// GC we delete historic layers. +/// +/// Even though rebuilding is an expensive (N log N) solution to the problem, +/// it's not critical since we do something equally expensive just to decide +/// whether or not to create new image layers. +/// TODO It's not expensive but it's not great to hold a layer map write lock +/// for that long. 
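Editor's note: the buffering scheme motivated above can be sketched independently of layers. Updates accumulate in a sorted buffer, and a rebuild only needs the earliest LSN the buffer touches, because nothing older can be affected. A toy version of that bookkeeping (Buffered and its fields are invented names, and the index here is a plain map rather than a versioned history):

use std::collections::BTreeMap;

#[derive(Default)]
struct Buffered {
    /// lsn.start -> pending change; None marks a pending removal.
    buffer: BTreeMap<u64, Option<&'static str>>,
    /// The "expensive" index that queries run against.
    index: BTreeMap<u64, &'static str>,
}

impl Buffered {
    fn insert(&mut self, lsn: u64, value: &'static str) {
        self.buffer.insert(lsn, Some(value));
    }

    fn remove(&mut self, lsn: u64) {
        self.buffer.insert(lsn, None);
    }

    /// Apply all buffered changes at once. The earliest buffered LSN bounds how far
    /// back the real structure has to trim and replay its history.
    fn rebuild(&mut self) {
        let Some((&rebuild_since, _)) = self.buffer.iter().next() else { return };
        for (lsn, change) in std::mem::take(&mut self.buffer) {
            match change {
                Some(value) => {
                    self.index.insert(lsn, value);
                }
                None => {
                    self.index.remove(&lsn);
                }
            }
        }
        println!("rebuilt everything from lsn {rebuild_since} upwards");
    }
}

fn main() {
    let mut b = Buffered::default();
    b.insert(100, "layer-a");
    b.insert(200, "layer-b");
    b.rebuild(); // "rebuilt everything from lsn 100 upwards"
    b.remove(200);
    b.rebuild(); // "rebuilt everything from lsn 200 upwards": older state is reused as-is
    assert!(b.index.contains_key(&100) && !b.index.contains_key(&200));
}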
+/// +/// If this becomes an actual bottleneck, one solution would be to build a +/// segment tree that holds PersistentLayerMaps. Though this would mean that +/// we take an additional log(N) performance hit for queries, which will probably +/// still be more critical. +/// +/// See this for more on persistent and retroactive techniques: +/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +pub struct BufferedHistoricLayerCoverage { + /// A persistent layer map that we rebuild when we need to retroactively update + historic_coverage: HistoricLayerCoverage, + + /// We buffer insertion into the PersistentLayerMap to decrease the number of rebuilds. + buffer: BTreeMap>, + + /// All current layers. This is not used for search. Only to make rebuilds easier. + layers: BTreeMap, +} + +impl std::fmt::Debug for BufferedHistoricLayerCoverage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RetroactiveLayerMap") + .field("buffer", &self.buffer) + .field("layers", &self.layers) + .finish() + } +} + +impl Default for BufferedHistoricLayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl BufferedHistoricLayerCoverage { + pub fn new() -> Self { + Self { + historic_coverage: HistoricLayerCoverage::::new(), + buffer: BTreeMap::new(), + layers: BTreeMap::new(), + } + } + + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { + self.buffer.insert(layer_key, Some(value)); + } + + pub fn remove(&mut self, layer_key: LayerKey) { + self.buffer.insert(layer_key, None); + } + + pub fn rebuild(&mut self) { + // Find the first LSN that needs to be rebuilt + let rebuild_since: u64 = match self.buffer.iter().next() { + Some((LayerKey { lsn, .. }, _)) => lsn.start, + None => return, // No need to rebuild if buffer is empty + }; + + // Apply buffered updates to self.layers + let num_updates = self.buffer.len(); + self.buffer.retain(|layer_key, layer| { + match layer { + Some(l) => { + self.layers.insert(layer_key.clone(), l.clone()); + } + None => { + self.layers.remove(layer_key); + } + }; + false + }); + + // Rebuild + let mut num_inserted = 0; + self.historic_coverage.trim(&rebuild_since); + for (layer_key, layer) in self.layers.range( + LayerKey { + lsn: rebuild_since..0, + key: 0..0, + is_image: false, + }.., + ) { + self.historic_coverage + .insert(layer_key.clone(), layer.clone()); + num_inserted += 1; + } + + // TODO maybe only warn if ratio is at least 10 + info!( + "Rebuilt layer map. Did {} insertions to process a batch of {} updates.", + num_inserted, num_updates, + ) + } + + /// Iterate all the layers + pub fn iter(&self) -> impl '_ + Iterator { + // NOTE we can actually perform this without rebuilding, + // but it's not necessary for now. + if !self.buffer.is_empty() { + panic!("rebuild pls") + } + + self.layers.values().cloned() + } + + /// Return a reference to a queryable map, assuming all updates + /// have already been processed using self.rebuild() + pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage> { + // NOTE we error here instead of implicitly rebuilding because + // rebuilding is somewhat expensive. + // TODO maybe implicitly rebuild and log/sentry an error? 
+ if !self.buffer.is_empty() { + anyhow::bail!("rebuild required") + } + + Ok(&self.historic_coverage) + } +} + +#[test] +fn test_retroactive_regression_1() { + let mut map = BufferedHistoricLayerCoverage::new(); + + map.insert( + LayerKey { + key: 0..21267647932558653966460912964485513215, + lsn: 23761336..23761457, + is_image: false, + }, + "sdfsdfs".to_string(), + ); + + map.rebuild(); + + let version = map.get().unwrap().get_version(23761457).unwrap(); + assert_eq!( + version.delta_coverage.query(100), + Some("sdfsdfs".to_string()) + ); +} + +#[test] +fn test_retroactive_simple() { + let mut map = BufferedHistoricLayerCoverage::new(); + + // Append some images in increasing LSN order + map.insert( + LayerKey { + key: 0..5, + lsn: 100..101, + is_image: true, + }, + "Image 1".to_string(), + ); + map.insert( + LayerKey { + key: 3..9, + lsn: 110..111, + is_image: true, + }, + "Image 2".to_string(), + ); + map.insert( + LayerKey { + key: 4..6, + lsn: 120..121, + is_image: true, + }, + "Image 3".to_string(), + ); + map.insert( + LayerKey { + key: 8..9, + lsn: 120..121, + is_image: true, + }, + "Image 4".to_string(), + ); + + // Add a delta layer out of order + map.insert( + LayerKey { + key: 2..5, + lsn: 105..106, + is_image: true, + }, + "Delta 1".to_string(), + ); + + // Rebuild so we can start querying + map.rebuild(); + + // Query key 4 + let version = map.get().unwrap().get_version(90); + assert!(version.is_none()); + let version = map.get().unwrap().get_version(102).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 1".to_string())); + let version = map.get().unwrap().get_version(107).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Delta 1".to_string())); + let version = map.get().unwrap().get_version(115).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); + let version = map.get().unwrap().get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 3".to_string())); + + // Remove Image 3 + map.remove(LayerKey { + key: 4..6, + lsn: 120..121, + is_image: true, + }); + map.rebuild(); + + // Check deletion worked + let version = map.get().unwrap().get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string())); +} diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs new file mode 100644 index 0000000000..4e3b4516dc --- /dev/null +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -0,0 +1,154 @@ +use std::ops::Range; + +// TODO the `im` crate has 20x more downloads and also has +// persistent/immutable BTree. It also runs a bit faster but +// results are not the same on some tests. +use rpds::RedBlackTreeMapSync; + +/// Data structure that can efficiently: +/// - find the latest layer by lsn.end at a given key +/// - iterate the latest layers in a key range +/// - insert layers in non-decreasing lsn.start order +/// +/// The struct is parameterized over Value for easier +/// testing, but in practice it's some sort of layer. +pub struct LayerCoverage { + /// For every change in coverage (as we sweep the key space) + /// we store (lsn.end, value). + /// + /// We use an immutable/persistent tree so that we can keep historic + /// versions of this coverage without cloning the whole thing and + /// incurring quadratic memory cost. See HistoricLayerCoverage. 
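Editor's note: the O(1)-snapshot property referred to above comes from structural sharing. Cloning a persistent structure copies a handle, and old versions keep pointing at the unchanged nodes. A minimal illustration with an Arc-based list instead of a real tree (purely illustrative, not the rpds API):

use std::sync::Arc;

/// A persistent singly linked list: pushing returns a new list that shares its
/// entire tail with the old one, so every old version stays valid and cheap.
enum List {
    Nil,
    Cons(i32, Arc<List>),
}

fn push(list: &Arc<List>, value: i32) -> Arc<List> {
    Arc::new(List::Cons(value, Arc::clone(list))) // one new node, everything else shared
}

fn len(mut list: &Arc<List>) -> usize {
    let mut n = 0;
    while let List::Cons(_, tail) = list.as_ref() {
        n += 1;
        list = tail;
    }
    n
}

fn main() {
    let v0 = Arc::new(List::Nil);
    let v1 = push(&v0, 10);
    let v2 = push(&v1, 20);
    // Three versions coexist, but v2's tail is literally v1, not a copy of it.
    assert_eq!((len(&v0), len(&v1), len(&v2)), (0, 1, 2));
    if let List::Cons(head, _) = v2.as_ref() {
        assert_eq!(*head, 20);
    }
}

This is why keeping one snapshot per insertion in HistoricLayerCoverage does not blow memory up quadratically, as the comment above points out.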
+ /// + /// We use the Sync version of the map because we want Self to + /// be Sync. Using nonsync might be faster, if we can work with + /// that. + nodes: RedBlackTreeMapSync>, +} + +impl Default for LayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl LayerCoverage { + pub fn new() -> Self { + Self { + nodes: RedBlackTreeMapSync::default(), + } + } + + /// Helper function to subdivide the key range without changing any values + /// + /// Complexity: O(log N) + fn add_node(&mut self, key: i128) { + let value = match self.nodes.range(..=key).last() { + Some((_, Some(v))) => Some(v.clone()), + Some((_, None)) => None, + None => None, + }; + self.nodes.insert_mut(key, value); + } + + /// Insert a layer. + /// + /// Complexity: worst case O(N), in practice O(log N). See NOTE in implementation. + pub fn insert(&mut self, key: Range, lsn: Range, value: Value) { + // Add nodes at endpoints + // + // NOTE The order of lines is important. We add nodes at the start + // and end of the key range **before updating any nodes** in order + // to pin down the current coverage outside of the relevant key range. + // Only the coverage inside the layer's key range should change. + self.add_node(key.start); + self.add_node(key.end); + + // Raise the height where necessary + // + // NOTE This loop is worst case O(N), but amortized O(log N) in the special + // case when rectangles have no height. In practice I don't think we'll see + // the kind of layer intersections needed to trigger O(N) behavior. The worst + // case is N/2 horizontal layers overlapped with N/2 vertical layers in a + // grid pattern. + let mut to_update = Vec::new(); + let mut to_remove = Vec::new(); + let mut prev_covered = false; + for (k, node) in self.nodes.range(key.clone()) { + let needs_cover = match node { + None => true, + Some((h, _)) => h < &lsn.end, + }; + if needs_cover { + match prev_covered { + true => to_remove.push(*k), + false => to_update.push(*k), + } + } + prev_covered = needs_cover; + } + if !prev_covered { + to_remove.push(key.end); + } + for k in to_update { + self.nodes.insert_mut(k, Some((lsn.end, value.clone()))); + } + for k in to_remove { + self.nodes.remove_mut(&k); + } + } + + /// Get the latest (by lsn.end) layer at a given key + /// + /// Complexity: O(log N) + pub fn query(&self, key: i128) -> Option { + self.nodes + .range(..=key) + .rev() + .next()? + .1 + .as_ref() + .map(|(_, v)| v.clone()) + } + + /// Iterate the changes in layer coverage in a given range. You will likely + /// want to start with self.query(key.start), and then follow up with self.range + /// + /// Complexity: O(log N + result_size) + pub fn range(&self, key: Range) -> impl '_ + Iterator)> { + self.nodes + .range(key) + .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone()))) + } + + /// O(1) clone + pub fn clone(&self) -> Self { + Self { + nodes: self.nodes.clone(), + } + } +} + +/// Image and delta coverage at a specific LSN. 
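Editor's note: because the persistent-map plumbing can obscure the algorithm, here is the same endpoint-then-sweep insertion written against a mutable std BTreeMap, with String values and without the node-pruning optimisation (a sketch of the idea only, not the real LayerCoverage type):

use std::collections::BTreeMap;
use std::ops::Range;

/// key -> Some((lsn.end, layer)) covering from that key onward, or None for a hole.
#[derive(Default)]
struct Coverage {
    nodes: BTreeMap<i128, Option<(u64, String)>>,
}

impl Coverage {
    /// Split the key space at `key` without changing the value in effect there.
    fn add_node(&mut self, key: i128) {
        let value = self.nodes.range(..=key).next_back().and_then(|(_, v)| v.clone());
        self.nodes.insert(key, value);
    }

    fn insert(&mut self, key: Range<i128>, lsn_end: u64, layer: &str) {
        // Pin the coverage just outside the affected range before touching anything inside.
        self.add_node(key.start);
        self.add_node(key.end);
        // Then overwrite every change point inside the range that sits lower than the new layer.
        let inside: Vec<i128> = self.nodes.range(key.start..key.end).map(|(k, _)| *k).collect();
        for k in inside {
            let lower = match &self.nodes[&k] {
                None => true,
                Some((end, _)) => *end < lsn_end,
            };
            if lower {
                self.nodes.insert(k, Some((lsn_end, layer.to_string())));
            }
        }
    }

    /// Latest layer (by lsn.end) at a key, like LayerCoverage::query.
    fn query(&self, key: i128) -> Option<&str> {
        self.nodes.range(..=key).next_back()?.1.as_ref().map(|(_, l)| l.as_str())
    }
}

fn main() {
    let mut c = Coverage::default();
    c.insert(0..10, 100, "wide-old");
    c.insert(3..6, 200, "narrow-new");
    assert_eq!(c.query(1), Some("wide-old"));
    assert_eq!(c.query(4), Some("narrow-new"));
    assert_eq!(c.query(8), Some("wide-old"));
    assert_eq!(c.query(12), None);
}

The two add_node calls are the important detail: they pin the coverage just outside the inserted key range before anything inside it is overwritten, which is the same ordering constraint called out in the NOTE above.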
+pub struct LayerCoverageTuple { + pub image_coverage: LayerCoverage, + pub delta_coverage: LayerCoverage, +} + +impl Default for LayerCoverageTuple { + fn default() -> Self { + Self { + image_coverage: LayerCoverage::default(), + delta_coverage: LayerCoverage::default(), + } + } +} + +impl LayerCoverageTuple { + pub fn clone(&self) -> Self { + Self { + image_coverage: self.image_coverage.clone(), + delta_coverage: self.delta_coverage.clone(), + } + } +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index dce7cd8bae..a9edee3794 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -16,6 +16,7 @@ use remote_storage::GenericRemoteStorage; use utils::crashsafe; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::{Tenant, TenantState}; @@ -24,8 +25,35 @@ use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; -static TENANTS: Lazy>>> = - Lazy::new(|| RwLock::new(HashMap::new())); +/// The tenants known to the pageserver. +/// The enum variants are used to distinguish the different states that the pageserver can be in. +enum TenantsMap { + /// [`init_tenant_mgr`] is not done yet. + Initializing, + /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. + /// New tenants can be added using [`tenant_map_insert`]. + Open(HashMap>), + /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. + /// Existing tenants are still accessible, but no new tenants can be created. + ShuttingDown(HashMap>), +} + +impl TenantsMap { + fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id), + } + } + fn remove(&mut self, tenant_id: &TenantId) -> Option> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id), + } + } +} + +static TENANTS: Lazy> = Lazy::new(|| RwLock::new(TenantsMap::Initializing)); /// Initialize repositories with locally available timelines. 
/// Timelines that are only partially available locally (remote storage has more data than this pageserver) @@ -36,13 +64,16 @@ pub async fn init_tenant_mgr( remote_storage: Option, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants - let mut number_of_tenants = 0; let tenants_dir = conf.tenants_path(); + let mut tenants = HashMap::new(); + let mut dir_entries = fs::read_dir(&tenants_dir) .await .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); + loop { match dir_entries.next_entry().await { Ok(None) => break, @@ -86,10 +117,10 @@ pub async fn init_tenant_mgr( conf, &tenant_dir_path, remote_storage.clone(), + &ctx, ) { Ok(tenant) => { - TENANTS.write().await.insert(tenant.tenant_id(), tenant); - number_of_tenants += 1; + tenants.insert(tenant.tenant_id(), tenant); } Err(e) => { error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}"); @@ -108,7 +139,11 @@ pub async fn init_tenant_mgr( } } - info!("Processed {number_of_tenants} local tenants at startup"); + info!("Processed {} local tenants at startup", tenants.len()); + + let mut tenants_map = TENANTS.write().await; + assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); + *tenants_map = TenantsMap::Open(tenants); Ok(()) } @@ -116,6 +151,7 @@ pub fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, remote_storage: Option, + ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( tenant_path.is_dir(), @@ -150,7 +186,7 @@ pub fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - Tenant::spawn_attach(conf, tenant_id, remote_storage) + Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) } else { warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); Tenant::create_broken_tenant(conf, tenant_id) @@ -158,7 +194,7 @@ pub fn schedule_local_tenant_processing( } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. - Tenant::spawn_load(conf, tenant_id, remote_storage) + Tenant::spawn_load(conf, tenant_id, remote_storage, ctx) }; Ok(tenant) } @@ -166,21 +202,44 @@ pub fn schedule_local_tenant_processing( /// /// Shut down all tenants. This runs as part of pageserver shutdown. /// +/// NB: We leave the tenants in the map, so that they remain accessible through +/// the management API until we shut it down. If we removed the shut-down tenants +/// from the tenants map, the management API would return 404 for these tenants, +/// because TenantsMap::get() now returns `None`. +/// That could be easily misinterpreted by control plane, the consumer of the +/// management API. For example, it could attach the tenant on a different pageserver. +/// We would then be in split-brain once this pageserver restarts. pub async fn shutdown_all_tenants() { + // Prevent new tenants from being created. 
let tenants_to_shut_down = { let mut m = TENANTS.write().await; - let mut tenants_to_shut_down = Vec::with_capacity(m.len()); - for (_, tenant) in m.drain() { - if tenant.is_active() { - // updates tenant state, forbidding new GC and compaction iterations from starting - tenant.set_stopping(); - tenants_to_shut_down.push(tenant) + match &mut *m { + TenantsMap::Initializing => { + *m = TenantsMap::ShuttingDown(HashMap::default()); + info!("tenants map is empty"); + return; + } + TenantsMap::Open(tenants) => { + let tenants_clone = tenants.clone(); + *m = TenantsMap::ShuttingDown(std::mem::take(tenants)); + tenants_clone + } + TenantsMap::ShuttingDown(_) => { + error!("already shutting down, this function isn't supposed to be called more than once"); + return; } } - drop(m); - tenants_to_shut_down }; + let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len()); + for (_, tenant) in tenants_to_shut_down { + if tenant.is_active() { + // updates tenant state, forbidding new GC and compaction iterations from starting + tenant.set_stopping(); + tenants_to_freeze_and_flush.push(tenant); + } + } + // Shut down all existing walreceiver connections and stop accepting the new ones. task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; @@ -192,7 +251,7 @@ pub async fn shutdown_all_tenants() { // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. - for tenant in tenants_to_shut_down { + for tenant in tenants_to_freeze_and_flush { let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); @@ -207,27 +266,23 @@ pub async fn create_tenant( tenant_conf: TenantConfOpt, tenant_id: TenantId, remote_storage: Option, -) -> anyhow::Result>> { - match TENANTS.write().await.entry(tenant_id) { - hash_map::Entry::Occupied(_) => { - debug!("tenant {tenant_id} already exists"); - Ok(None) - } - hash_map::Entry::Vacant(v) => { - // Hold the write_tenants() lock, since all of this is local IO. - // If this section ever becomes contentious, introduce a new `TenantState::Creating`. - let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; - let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?; - let crated_tenant_id = created_tenant.tenant_id(); - anyhow::ensure!( + ctx: &RequestContext, +) -> Result, TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { + // We're holding the tenants lock in write mode while doing local IO. + // If this section ever becomes contentious, introduce a new `TenantState::Creating` + // and do the work in that state. + let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; + let created_tenant = + schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?; + let crated_tenant_id = created_tenant.tenant_id(); + anyhow::ensure!( tenant_id == crated_tenant_id, "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})", ); - v.insert(Arc::clone(&created_tenant)); - Ok(Some(created_tenant)) - } - } + vacant_entry.insert(Arc::clone(&created_tenant)); + Ok(created_tenant) + }).await } pub async fn update_tenant_config( @@ -236,10 +291,11 @@ pub async fn update_tenant_config( tenant_id: TenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - get_tenant(tenant_id, true) - .await? 
- .update_tenant_config(tenant_conf); - Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; + let tenant = get_tenant(tenant_id, true).await?; + + tenant.update_tenant_config(tenant_conf); + let tenant_config_path = conf.tenant_config_path(tenant_id); + Tenant::persist_tenant_config(&tenant.tenant_id(), &tenant_config_path, tenant_conf, false)?; Ok(()) } @@ -260,10 +316,14 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul } } -pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { +pub async fn delete_timeline( + tenant_id: TenantId, + timeline_id: TimelineId, + ctx: &RequestContext, +) -> anyhow::Result<()> { match get_tenant(tenant_id, true).await { Ok(tenant) => { - tenant.delete_timeline(timeline_id).await?; + tenant.delete_timeline(timeline_id, ctx).await?; } Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), } @@ -291,8 +351,9 @@ pub async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: Option, -) -> anyhow::Result<()> { - run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + ctx: &RequestContext, +) -> Result<(), TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { let tenant_path = conf.tenant_path(&tenant_id); let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); if tenant_ignore_mark.exists() { @@ -300,7 +361,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -329,16 +390,24 @@ pub async fn ignore_tenant( .await } +#[derive(Debug, thiserror::Error)] +pub enum TenantMapListError { + #[error("tenant map is still initiailizing")] + Initializing, +} + /// /// Get list of tenants, for the mgmt API /// -pub async fn list_tenants() -> Vec<(TenantId, TenantState)> { - TENANTS - .read() - .await - .iter() +pub async fn list_tenants() -> Result, TenantMapListError> { + let tenants = TENANTS.read().await; + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + Ok(m.iter() .map(|(id, tenant)| (*id, tenant.current_state())) - .collect() + .collect()) } /// Execute Attach mgmt API command. 
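Editor's note: taken together, the TenantsMap changes in this file boil down to a small state machine. Reads work in Open and ShuttingDown, inserts only in Open, and neither succeeds while Initializing. A compressed toy model of those rules (u32 ids and String values in place of TenantId and Arc<Tenant>, and no locking):

use std::collections::HashMap;

enum Map {
    Initializing,
    Open(HashMap<u32, String>),
    ShuttingDown(HashMap<u32, String>),
}

impl Map {
    fn get(&self, id: u32) -> Option<&String> {
        match self {
            Map::Initializing => None,
            Map::Open(m) | Map::ShuttingDown(m) => m.get(&id),
        }
    }

    fn insert(&mut self, id: u32, tenant: String) -> Result<(), &'static str> {
        match self {
            Map::Initializing => Err("still initializing"),
            Map::ShuttingDown(_) => Err("shutting down"),
            Map::Open(m) => {
                m.insert(id, tenant);
                Ok(())
            }
        }
    }

    /// Flip to ShuttingDown while keeping existing entries readable.
    fn shut_down(&mut self) {
        if let Map::Open(m) = self {
            let entries = std::mem::take(m);
            *self = Map::ShuttingDown(entries);
        }
    }
}

fn main() {
    let booting = Map::Initializing;
    assert!(booting.get(1).is_none()); // nothing is visible before init finishes

    let mut map = Map::Open(HashMap::new());
    map.insert(1, "tenant-1".to_string()).unwrap();
    map.shut_down();
    assert!(map.get(1).is_some()); // still visible, e.g. to the management API
    assert!(map.insert(2, "tenant-2".to_string()).is_err()); // but no new tenants
}

The shut_down() transition is the part shutdown_all_tenants() relies on: entries survive the state change so they remain readable, while tenant_map_insert style insertions are refused from then on.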
@@ -349,34 +418,62 @@ pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, -) -> anyhow::Result<()> { - run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + ctx: &RequestContext, +) -> Result<(), TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { let tenant_path = conf.tenant_path(&tenant_id); anyhow::ensure!( !tenant_path.exists(), "Cannot attach tenant {tenant_id}, local tenant directory already exists" ); - let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage); + let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx); vacant_entry.insert(tenant); - Ok(()) }) .await } -async fn run_if_no_tenant_in_memory(tenant_id: TenantId, run: F) -> anyhow::Result +#[derive(Debug, thiserror::Error)] +pub enum TenantMapInsertError { + #[error("tenant map is still initializing")] + StillInitializing, + #[error("tenant map is shutting down")] + ShuttingDown, + #[error("tenant {0} already exists, state: {1:?}")] + TenantAlreadyExists(TenantId, TenantState), + #[error(transparent)] + Closure(#[from] anyhow::Error), +} + +/// Give the given closure access to the tenants map entry for the given `tenant_id`, iff that +/// entry is vacant. The closure is responsible for creating the tenant object and inserting +/// it into the tenants map through the vacnt entry that it receives as argument. +/// +/// NB: the closure should return quickly because the current implementation of tenants map +/// serializes access through an `RwLock`. +async fn tenant_map_insert( + tenant_id: TenantId, + insert_fn: F, +) -> Result where F: FnOnce(hash_map::VacantEntry>) -> anyhow::Result, { - match TENANTS.write().await.entry(tenant_id) { - hash_map::Entry::Occupied(e) => { - anyhow::bail!( - "tenant {tenant_id} already exists, state: {:?}", - e.get().current_state() - ) - } - hash_map::Entry::Vacant(v) => run(v), + let mut guard = TENANTS.write().await; + let m = match &mut *guard { + TenantsMap::Initializing => return Err(TenantMapInsertError::StillInitializing), + TenantsMap::ShuttingDown(_) => return Err(TenantMapInsertError::ShuttingDown), + TenantsMap::Open(m) => m, + }; + match m.entry(tenant_id) { + hash_map::Entry::Occupied(e) => Err(TenantMapInsertError::TenantAlreadyExists( + tenant_id, + e.get().current_state(), + )), + hash_map::Entry::Vacant(v) => match insert_fn(v) { + Ok(v) => Ok(v), + Err(e) => Err(TenantMapInsertError::Closure(e)), + }, } } @@ -449,9 +546,9 @@ pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, gc_req: TimelineGcRequest, + ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().await; - let tenant = guard .get(&tenant_id) .map(Arc::clone) @@ -462,7 +559,8 @@ pub async fn immediate_gc( // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); - // Run in task_mgr to avoid race with detach operation + // Run in task_mgr to avoid race with tenant_detach operation + let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -474,7 +572,7 @@ pub async fn immediate_gc( async move { fail::fail_point!("immediate_gc_task_pre"); let result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr) + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx) .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) .await; // FIXME: `gc_iteration` can return an error 
for multiple reasons; we should handle it @@ -497,6 +595,7 @@ pub async fn immediate_gc( pub async fn immediate_compact( tenant_id: TenantId, timeline_id: TimelineId, + ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().await; @@ -510,7 +609,8 @@ pub async fn immediate_compact( .get_timeline(timeline_id, true) .map_err(ApiError::NotFound)?; - // Run in task_mgr to avoid race with detach operation + // Run in task_mgr to avoid race with tenant_detach operation + let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -523,7 +623,7 @@ pub async fn immediate_compact( false, async move { let result = timeline - .compact() + .compact(&ctx) .instrument( info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id), ) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 013591caee..3f69017160 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1010,7 +1010,10 @@ impl RemoteTimelineClient { #[cfg(test)] mod tests { use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::{ + tenant::harness::{TenantHarness, TIMELINE_ID}, + DEFAULT_PG_VERSION, + }; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use std::{collections::HashSet, path::Path}; use utils::lsn::Lsn; @@ -1064,9 +1067,19 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { + // Use a current-thread runtime in the test + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?, + )); + let _entered = runtime.enter(); + let harness = TenantHarness::create("upload_scheduling")?; + let (tenant, ctx) = runtime.block_on(harness.load()); + let _timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; let timeline_path = harness.timeline_path(&TIMELINE_ID); - std::fs::create_dir_all(&timeline_path)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs"); std::fs::create_dir_all(remote_fs_dir)?; @@ -1084,14 +1097,6 @@ mod tests { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; - // Use a current-thread runtime in the test - let runtime = Box::leak(Box::new( - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?, - )); - let _entered = runtime.enter(); - // Test outline: // // Schedule upload of a bunch of layers. Check that they are started immediately, not queued diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 61cb32fc76..2fed4f88b3 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -6,6 +6,7 @@ use anyhow::Context; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; +use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::Tenant; @@ -181,6 +182,7 @@ pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + ctx: &RequestContext, ) -> anyhow::Result { // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to // our advantage with `?` error handling. 
@@ -188,7 +190,7 @@ pub(super) async fn gather_inputs( // refresh is needed to update gc related pitr_cutoff and horizon_cutoff tenant - .refresh_gc_info() + .refresh_gc_info(ctx) .await .context("Failed to refresh gc_info before gathering inputs")?; @@ -329,7 +331,13 @@ pub(super) async fn gather_inputs( } else { let timeline = Arc::clone(&timeline); let parallel_size_calcs = Arc::clone(limit); - joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn)); + let ctx = ctx.attached_child(); + joinset.spawn(calculate_logical_size( + parallel_size_calcs, + timeline, + *lsn, + ctx, + )); } } @@ -387,6 +395,7 @@ pub(super) async fn gather_inputs( parallel_size_calcs, timeline.clone(), lsn, + ctx.attached_child(), )); if let Some(parent_id) = timeline.get_ancestor_timeline_id() { @@ -582,13 +591,14 @@ async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, + ctx: RequestContext, ) -> Result { let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); let size_res = timeline - .spawn_ondemand_logical_size_calculation(lsn) + .spawn_ondemand_logical_size_calculation(lsn, ctx) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 6aee8ce23c..2149fc7eb7 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -196,3 +196,50 @@ pub fn downcast_remote_layer( None } } + +impl std::fmt::Debug for dyn Layer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Layer") + .field("short_id", &self.short_id()) + .finish() + } +} + +/// Holds metadata about a layer without any content. Used mostly for testing. +pub struct LayerDescriptor { + pub key: Range, + pub lsn: Range, + pub is_incremental: bool, + pub short_id: String, +} + +impl Layer for LayerDescriptor { + fn get_key_range(&self) -> Range { + self.key.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn.clone() + } + + fn is_incremental(&self) -> bool { + self.is_incremental + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + ) -> Result { + todo!("This method shouldn't be part of the Layer trait") + } + + fn short_id(&self) -> String { + self.short_id.clone() + } + + fn dump(&self, _verbose: bool) -> Result<()> { + todo!() + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b7ad8fe791..b126545ee4 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -5,6 +5,7 @@ use std::ops::ControlFlow; use std::sync::Arc; use std::time::Duration; +use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; @@ -52,19 +53,20 @@ async fn compaction_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); loop { trace!("waking up"); let tenant = tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { info!("received cancellation request"); - return; + return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { ControlFlow::Break(()) => return, ControlFlow::Continue(tenant) => tenant, }, - }; + }; let mut sleep_duration = tenant.get_compaction_period(); if sleep_duration == Duration::ZERO { @@ -73,7 +75,7 @@ async fn compaction_loop(tenant_id: TenantId) { sleep_duration = Duration::from_secs(10); } else { // Run compaction - if let Err(e) = tenant.compaction_iteration().await { + if let Err(e) = tenant.compaction_iteration(&ctx).await { sleep_duration = wait_duration; error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); } @@ -103,6 +105,9 @@ async fn gc_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + // GC might require downloading, to find the cutoff LSN that corresponds to the + // cutoff specified as time. + let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); loop { trace!("waking up"); @@ -127,7 +132,7 @@ async fn gc_loop(tenant_id: TenantId) { } else { // Run gc if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await { sleep_duration = wait_duration; error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d59858f582..0ca8a0e491 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,7 @@ //! +mod walreceiver; + use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; use fail::fail_point; @@ -13,6 +15,7 @@ use pageserver_api::models::{ use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::id::TenantTimelineId; use std::cmp::{max, min, Ordering}; use std::collections::HashMap; @@ -23,6 +26,8 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use crate::broker_client::is_broker_client_initialized; +use crate::context::{DownloadBehavior, RequestContext}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName, @@ -58,11 +63,11 @@ use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr::TaskKind; -use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; +use walreceiver::spawn_connection_manager_task; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; @@ -128,7 +133,6 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, - // Metrics metrics: TimelineMetrics, /// Ensures layers aren't frozen by checkpointer between @@ -377,6 +381,12 @@ pub enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error + /// The operation would require downloading a layer that is missing locally. 
+ NeedsDownload(TenantTimelineId, LayerFileName), + + /// The operation was cancelled + Cancelled, + /// An error happened replaying WAL records #[error(transparent)] WalRedo(#[from] crate::walredo::WalRedoError), @@ -386,6 +396,33 @@ impl std::fmt::Debug for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), + Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { + write!( + f, + "layer {}/{} needs download", + tenant_timeline_id, + layer_file_name.file_name() + ) + } + Self::Cancelled => write!(f, "cancelled"), + Self::WalRedo(err) => err.fmt(f), + } + } +} + +impl std::fmt::Display for PageReconstructError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Self::Other(err) => err.fmt(f), + Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { + write!( + f, + "layer {}/{} needs download", + tenant_timeline_id, + layer_file_name.file_name() + ) + } + Self::Cancelled => write!(f, "cancelled"), Self::WalRedo(err) => err.fmt(f), } } @@ -422,11 +459,24 @@ impl Timeline { /// an ancestor branch, for example, or waste a lot of cycles chasing the /// non-existing key. /// - pub async fn get(&self, key: Key, lsn: Lsn) -> Result { + pub async fn get( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + // XXX: structured stats collection for layer eviction here. + trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -450,7 +500,7 @@ impl Timeline { img: cached_page_img, }; - self.get_reconstruct_data(key, lsn, &mut reconstruct_state) + self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; self.metrics @@ -513,13 +563,25 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + pub async fn wait_lsn( + &self, + lsn: Lsn, + _ctx: &RequestContext, /* Prepare for use by cancellation */ + ) -> anyhow::Result<()> { anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); // This should never be called from the WAL receiver, because that could lead // to a deadlock. 
anyhow::ensure!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), + "wait_lsn cannot be called in WAL receiver" + ); + anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), + "wait_lsn cannot be called in WAL receiver" + ); + anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), "wait_lsn cannot be called in WAL receiver" ); @@ -558,7 +620,7 @@ impl Timeline { self.flush_frozen_layers_and_wait().await } - pub async fn compact(&self) -> anyhow::Result<()> { + pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created @@ -616,14 +678,16 @@ impl Timeline { .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), + ctx, ) .await { Ok((partitioning, lsn)) => { // 2. Create new image layers for partitions that have been modified // "enough". - let layer_paths_to_upload = - self.create_image_layers(&partitioning, lsn, false).await?; + let layer_paths_to_upload = self + .create_image_layers(&partitioning, lsn, false, ctx) + .await?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -673,7 +737,10 @@ impl Timeline { /// the initial size calculation has not been run (gets triggered on the first size access). /// /// return size and boolean flag that shows if the size is exact - pub fn get_current_logical_size(self: &Arc) -> anyhow::Result<(u64, bool)> { + pub fn get_current_logical_size( + self: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<(u64, bool)> { let current_size = self.current_logical_size.current_size()?; debug!("Current size: {current_size:?}"); @@ -683,7 +750,7 @@ impl Timeline { (current_size, self.current_logical_size.initial_part_end) { is_exact = false; - self.try_spawn_size_init_task(init_lsn); + self.try_spawn_size_init_task(init_lsn, ctx); } Ok((size, is_exact)) @@ -729,16 +796,24 @@ impl Timeline { Ok(()) } + pub fn activate(self: &Arc) { + self.set_state(TimelineState::Active); + self.launch_wal_receiver(); + } + pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { - debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (st, TimelineState::Loading) => { + error!("ignoring transition from {st:?} into Loading state"); } (TimelineState::Broken, _) => { error!("Ignoring state update {new_state:?} for broken tenant"); } (TimelineState::Stopping, TimelineState::Active) => { - debug!("Not activating a Stopping timeline"); + error!("Not activating a Stopping timeline"); } (_, new_state) => { self.state.send_replace(new_state); @@ -812,7 +887,7 @@ impl Timeline { pg_version: u32, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); - let (state, _) = watch::channel(TimelineState::Suspended); + let (state, _) = watch::channel(TimelineState::Loading); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); @@ -884,6 +959,10 @@ impl Timeline { }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result 
+ .metrics + .last_record_gauge + .set(disk_consistent_lsn.0 as i64); + result }) } @@ -909,22 +988,25 @@ impl Timeline { let layer_flush_start_rx = self.layer_flush_start_tx.subscribe(); let self_clone = Arc::clone(self); + info!("spawning flush loop"); task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush task", - false, - async move { - self_clone.flush_loop(layer_flush_start_rx).await; - let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); - assert_eq!(*flush_loop_state, FlushLoopState::Running); - *flush_loop_state = FlushLoopState::Exited; - Ok(()) } - .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) - ); + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush task", + false, + async move { + let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); + self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; + let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); + assert_eq!(*flush_loop_state, FlushLoopState::Running); + *flush_loop_state = FlushLoopState::Exited; + Ok(()) + } + .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); *flush_loop_state = FlushLoopState::Running; } @@ -955,12 +1037,16 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); drop(tenant_conf_guard); let self_clone = Arc::clone(self); + let background_ctx = + // XXX: this is a detached_child. Plumb through the ctx from call sites. + RequestContext::todo_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); spawn_connection_manager_task( self_clone, walreceiver_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), + background_ctx, ); } @@ -970,6 +1056,7 @@ impl Timeline { /// pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let mut num_layers = 0; let timer = self.metrics.load_layer_map_histo.start_timer(); @@ -1010,7 +1097,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - layers.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1041,7 +1128,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - layers.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1067,6 +1154,7 @@ impl Timeline { } } + updates.flush(); layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); info!( @@ -1091,6 +1179,11 @@ impl Timeline { // Are we missing some files that are present in remote storage? // Create RemoteLayer instances for them. let mut local_only_layers = local_layers; + + // We're holding a layer map lock for a while but this + // method is only called during init so it's fine. 
+ let mut layer_map = self.layers.write().unwrap(); + let mut updates = layer_map.batch_update(); for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1129,7 +1222,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - self.layers.write().unwrap().remove_historic(local_layer); + updates.remove_historic(local_layer); // fall-through to adding the remote layer } } else { @@ -1171,7 +1264,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - self.layers.write().unwrap().insert_historic(remote_layer); + updates.insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1194,13 +1287,14 @@ impl Timeline { &remote_layer_metadata, ); let remote_layer = Arc::new(remote_layer); - self.layers.write().unwrap().insert_historic(remote_layer); + updates.insert_historic(remote_layer); } #[cfg(test)] LayerFileName::Test(_) => unreachable!(), } } + updates.flush(); Ok(local_only_layers) } @@ -1280,7 +1374,7 @@ impl Timeline { Ok(()) } - fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { + fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn, ctx: &RequestContext) { let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) .try_acquire_owned() { @@ -1296,8 +1390,18 @@ impl Timeline { .initial_logical_size .get() .is_none()); + + info!( + "spawning logical size computation from context of task kind {:?}", + ctx.task_kind() + ); // We need to start the computation task. + // It gets a separate context since it will outlive the request that called this function. let self_clone = Arc::clone(self); + let background_ctx = ctx.detached_child( + TaskKind::InitialLogicalSizeCalculation, + DownloadBehavior::Download, + ); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, @@ -1307,7 +1411,9 @@ impl Timeline { false, // NB: don't log errors here, task_mgr will do that. async move { - let calculated_size = match self_clone.logical_size_calculation_task(init_lsn).await + let calculated_size = match self_clone + .logical_size_calculation_task(init_lsn, &background_ctx) + .await { Ok(s) => s, Err(CalculateLogicalSizeError::Cancelled) => { @@ -1342,18 +1448,27 @@ impl Timeline { pub fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, + ctx: RequestContext, ) -> oneshot::Receiver> { let (sender, receiver) = oneshot::channel(); let self_clone = Arc::clone(self); + // XXX if our caller loses interest, i.e., ctx is cancelled, + // we should stop the size calculation work and return an error. + // That would require restructuring this function's API to + // return the result directly, instead of a Receiver for the result. 
+ let ctx = ctx.detached_child( + TaskKind::OndemandLogicalSizeCalculation, + DownloadBehavior::Download, + ); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::InitialLogicalSizeCalculation, + task_mgr::TaskKind::OndemandLogicalSizeCalculation, Some(self.tenant_id), Some(self.timeline_id), "ondemand logical size calculation", false, async move { - let res = self_clone.logical_size_calculation_task(lsn).await; + let res = self_clone.logical_size_calculation_task(lsn, &ctx).await; let _ = sender.send(res).ok(); Ok(()) // Receiver is responsible for handling errors }, @@ -1365,6 +1480,7 @@ impl Timeline { async fn logical_size_calculation_task( self: &Arc, init_lsn: Lsn, + ctx: &RequestContext, ) -> Result { let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); @@ -1372,12 +1488,13 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); + let ctx = ctx.attached_child(); tokio::task::spawn_blocking(move || { // Run in a separate thread since this can do a lot of // synchronous file IO without .await inbetween // if there are no RemoteLayers that would require downloading. let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel)) + h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) }) .await .context("Failed to spawn calculation result task")? @@ -1392,7 +1509,7 @@ impl Timeline { TimelineState::Active => continue, TimelineState::Broken | TimelineState::Stopping - | TimelineState::Suspended => { + | TimelineState::Loading => { break format!("aborted because timeline became inactive (new state: {new_state:?})") } } @@ -1432,10 +1549,11 @@ impl Timeline { /// /// NOTE: counted incrementally, includes ancestors. This can be a slow operation, /// especially if we need to download remote layers. - async fn calculate_logical_size( + pub async fn calculate_logical_size( &self, up_to_lsn: Lsn, cancel: CancellationToken, + ctx: &RequestContext, ) -> Result { info!( "Calculating logical size for timeline {} at {}", @@ -1478,7 +1596,7 @@ impl Timeline { self.metrics.logical_size_histo.start_timer() }; let logical_size = self - .get_current_logical_size_non_incremental(up_to_lsn, cancel) + .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); @@ -1555,6 +1673,7 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { // Start from the current timeline. let mut timeline_owned; @@ -1742,14 +1861,43 @@ impl Timeline { let remote_layer_as_persistent: Arc = Arc::clone(&remote_layer) as Arc; let id = remote_layer_as_persistent.traversal_id(); - info!("need remote layer {id}"); + info!( + "need remote layer {} for task kind {:?}", + id, + ctx.task_kind() + ); // The next layer doesn't exist locally. Need to download it. // (The control flow is a bit complicated here because we must drop the 'layers' // lock before awaiting on the Future.) 
- info!("on-demand downloading remote layer {id}"); - timeline.download_remote_layer(remote_layer).await?; - continue 'layer_map_search; + match ( + ctx.download_behavior(), + self.conf.ondemand_download_behavior_treat_error_as_warn, + ) { + (DownloadBehavior::Download, _) => { + info!( + "on-demand downloading remote layer {id} for task kind {:?}", + ctx.task_kind() + ); + timeline.download_remote_layer(remote_layer).await?; + continue 'layer_map_search; + } + (DownloadBehavior::Warn, _) | (DownloadBehavior::Error, true) => { + warn!( + "unexpectedly on-demand downloading remote layer {} for task kind {:?}", + id, + ctx.task_kind() + ); + timeline.download_remote_layer(remote_layer).await?; + continue 'layer_map_search; + } + (DownloadBehavior::Error, false) => { + return Err(PageReconstructError::NeedsDownload( + TenantTimelineId::new(self.tenant_id, self.timeline_id), + remote_layer.file_name.clone(), + )) + } + } } } } @@ -1871,7 +2019,11 @@ impl Timeline { } /// Layer flusher task's main loop. - async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver) { + async fn flush_loop( + &self, + mut layer_flush_start_rx: tokio::sync::watch::Receiver, + ctx: &RequestContext, + ) { info!("started flush loop"); loop { tokio::select! { @@ -1892,7 +2044,7 @@ impl Timeline { // drop 'layers' lock to allow concurrent reads and writes }; if let Some(layer_to_flush) = layer_to_flush { - if let Err(err) = self.flush_frozen_layer(layer_to_flush).await { + if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await { error!("could not flush frozen layer: {err:?}"); break Err(err); } @@ -1957,8 +2109,12 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] - async fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { + #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] + async fn flush_frozen_layer( + &self, + frozen_layer: Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -1966,10 +2122,12 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not + // require downloading anything during initial import. let (partitioning, _lsn) = self - .repartition(self.initdb_lsn, self.get_compaction_target_size()) + .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx) .await?; - self.create_image_layers(&partitioning, self.initdb_lsn, true) + self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) .await? } else { // normal case, write out a L0 delta layer file. 
@@ -2099,10 +2257,11 @@ impl Timeline { ])?; // Add it to the layer map - { - let mut layers = self.layers.write().unwrap(); - layers.insert_historic(Arc::new(new_delta)); - } + self.layers + .write() + .unwrap() + .batch_update() + .insert_historic(Arc::new(new_delta)); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2119,6 +2278,7 @@ impl Timeline { &self, lsn: Lsn, partition_size: u64, + ctx: &RequestContext, ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { let partitioning_guard = self.partitioning.lock().unwrap(); @@ -2129,7 +2289,7 @@ impl Timeline { return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); } } - let keyspace = self.collect_keyspace(lsn).await?; + let keyspace = self.collect_keyspace(lsn, ctx).await?; let partitioning = keyspace.partition(partition_size); let mut partitioning_guard = self.partitioning.lock().unwrap(); @@ -2166,13 +2326,15 @@ impl Timeline { // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed // after we read last_record_lsn, which is passed here in the 'lsn' argument. if img_lsn < lsn { - let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + let threshold = self.get_image_creation_threshold(); + let num_deltas = + layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; debug!( "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", img_range.start, img_range.end, num_deltas, img_lsn, lsn ); - if num_deltas >= self.get_image_creation_threshold() { + if num_deltas >= threshold { return Ok(true); } } @@ -2187,6 +2349,7 @@ impl Timeline { partitioning: &KeyPartitioning, lsn: Lsn, force: bool, + ctx: &RequestContext, ) -> Result, PageReconstructError> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); @@ -2211,7 +2374,7 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = match self.get(key, lsn).await { + let img = match self.get(key, lsn, ctx).await { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the @@ -2267,21 +2430,23 @@ impl Timeline { let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for l in image_layers { let path = l.filename(); let metadata = timeline_path .join(path.file_name()) .metadata() - .context("reading metadata of layer file {path}")?; + .with_context(|| format!("reading metadata of layer file {}", path.file_name()))?; layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); self.metrics .resident_physical_size_gauge .add(metadata.len()); - layers.insert_historic(Arc::new(l)); + updates.insert_historic(Arc::new(l)); } + updates.flush(); drop(layers); timer.stop_and_record(); @@ -2577,6 +2742,7 @@ impl Timeline { } let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); for l in new_layers { let new_delta_path = l.path(); @@ -2597,7 +2763,7 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); - layers.insert_historic(x); + updates.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can @@ -2611,8 +2777,9 @@ impl Timeline { } 
layer_names_to_delete.push(l.filename()); l.delete()?; - layers.remove_historic(l); + updates.remove_historic(l); } + updates.flush(); drop(layers); // Also schedule the deletions in remote storage @@ -2662,6 +2829,7 @@ impl Timeline { retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result<()> { // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // @@ -2674,7 +2842,7 @@ impl Timeline { if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp).await? { + match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await? { LsnForTimestamp::Present(lsn) => lsn, LsnForTimestamp::Future(lsn) => { // The timestamp is in the future. That sounds impossible, @@ -2725,6 +2893,8 @@ impl Timeline { /// obsolete. /// pub(super) async fn gc(&self) -> anyhow::Result { + let timer = self.metrics.garbage_collect_histo.start_timer(); + fail_point!("before-timeline-gc"); let _layer_removal_cs = self.layer_removal_cs.lock().await; @@ -2745,11 +2915,17 @@ impl Timeline { let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); - self.gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) + let res = self + .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) .instrument( info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff), ) - .await + .await?; + + // only record successes + timer.stop_and_record(); + + Ok(res) } async fn gc_timeline( @@ -2812,6 +2988,7 @@ impl Timeline { // 3. it doesn't need to be retained for 'retain_lsns'; // 4. newer on-disk image layers cover the layer's whole key range // + // TODO holding a write lock is too agressive and avoidable let mut layers = self.layers.write().unwrap(); 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -2843,6 +3020,8 @@ impl Timeline { // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. + // + // TODO Vec is not a great choice for `retain_lsns` for retain_lsn in &retain_lsns { // start_lsn is inclusive if &l.get_lsn_range().start <= retain_lsn { @@ -2896,6 +3075,7 @@ impl Timeline { layers_to_remove.push(Arc::clone(&l)); } + let mut updates = layers.batch_update(); if !layers_to_remove.is_empty() { // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. @@ -2913,7 +3093,13 @@ impl Timeline { } layer_names_to_delete.push(doomed_layer.filename()); doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning? - layers.remove_historic(doomed_layer); + + // TODO Removing from the bottom of the layer map is expensive. + // Maybe instead discard all layer map historic versions that + // won't be needed for page reconstruction for this timeline, + // and mark what we can't delete yet as deleted from the layer + // map index without actually rebuilding the index. + updates.remove_historic(doomed_layer); result.layers_removed += 1; } @@ -2925,6 +3111,7 @@ impl Timeline { remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; } } + updates.flush(); info!( "GC completed removing {} layers, cutoff {}", @@ -3081,11 +3268,13 @@ impl Timeline { // Delta- or ImageLayer in the layer map. 
let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size); let mut layers = self_clone.layers.write().unwrap(); + let mut updates = layers.batch_update(); { let l: Arc = remote_layer.clone(); - layers.remove_historic(l); + updates.remove_historic(l); } - layers.insert_historic(new_layer); + updates.insert_historic(new_layer); + updates.flush(); drop(layers); // Now that we've inserted the download into the layer map, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs similarity index 83% rename from pageserver/src/walreceiver.rs rename to pageserver/src/tenant/timeline/walreceiver.rs index fc9daadc5c..f33a12c5cc 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,58 +23,15 @@ mod connection_manager; mod walreceiver_connection; -use crate::config::PageServerConf; use crate::task_mgr::WALRECEIVER_RUNTIME; -use anyhow::Context; -use once_cell::sync::OnceCell; use std::future::Future; -use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; pub use connection_manager::spawn_connection_manager_task; -static BROKER_CLIENT: OnceCell = OnceCell::new(); - -/// -/// Initialize the broker client. This must be called once at page server startup. -/// -pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { - let broker_endpoint = conf.broker_endpoint.clone(); - - // Note: we do not attempt connecting here (but validate endpoints sanity). - let broker_client = - storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( - format!( - "Failed to create broker client to {}", - &conf.broker_endpoint - ), - )?; - - if BROKER_CLIENT.set(broker_client).is_err() { - panic!("broker already initialized"); - } - - info!( - "Initialized broker client with endpoints: {}", - broker_endpoint - ); - Ok(()) -} - -/// -/// Get a handle to the broker client -/// -pub fn get_broker_client() -> &'static BrokerClientChannel { - BROKER_CLIENT.get().expect("broker client not initialized") -} - -pub fn is_broker_client_initialized() -> bool { - BROKER_CLIENT.get().is_some() -} - /// A handle of an asynchronous task. /// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] /// and a cancellation token that it can listen to for earlier interrupts. 
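The `TaskHandle` described in the doc comment above pairs a lifecycle-event channel with a cancellation token. A trimmed-down sketch of that shape follows, assuming only `tokio::sync::watch` and `tokio_util::sync::CancellationToken` (both already in use in this module); the real `TaskHandle` differs in its details, and every name here is hypothetical:

    use tokio::sync::watch;
    use tokio_util::sync::CancellationToken;

    // Hypothetical, minimal version of the handle described above.
    struct SketchTaskHandle<E> {
        events: watch::Receiver<Option<E>>,
        cancel: CancellationToken,
        join: tokio::task::JoinHandle<anyhow::Result<()>>,
    }

    impl<E: Clone + Send + Sync + 'static> SketchTaskHandle<E> {
        fn spawn<Fut>(body: impl FnOnce(watch::Sender<Option<E>>, CancellationToken) -> Fut) -> Self
        where
            Fut: std::future::Future<Output = anyhow::Result<()>> + Send + 'static,
        {
            let (events_tx, events_rx) = watch::channel(None);
            let cancel = CancellationToken::new();
            // The task reports progress through `events_tx` and polls `cancel`
            // to notice an early shutdown request.
            let join = tokio::spawn(body(events_tx, cancel.clone()));
            Self {
                events: events_rx,
                cancel,
                join,
            }
        }

        fn latest_event(&self) -> Option<E> {
            (*self.events.borrow()).clone()
        }

        async fn shutdown(self) -> anyhow::Result<()> {
            self.cancel.cancel();
            self.join.await?
        }
    }

The point of the pattern is that the owner can both observe the task's latest state update and request cancellation, without the task needing any back-reference to its owner.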
@@ -95,7 +52,6 @@ pub enum TaskEvent { #[derive(Debug, Clone)] pub enum TaskStateUpdate { - Init, Started, Progress(E), } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs similarity index 96% rename from pageserver/src/walreceiver/connection_manager.rs rename to pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 8b60e59305..cd7c7c51d2 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -11,10 +11,12 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; -use crate::task_mgr::TaskKind; +use super::TaskStateUpdate; +use crate::broker_client::get_broker_client; +use crate::context::RequestContext; use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::task_mgr::{self, TaskKind}; use crate::tenant::Timeline; -use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; @@ -27,10 +29,7 @@ use storage_broker::Streaming; use tokio::{select, sync::watch}; use tracing::*; -use crate::{ - exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, -}; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use postgres_connection::{parse_host_port, PgConnectionConfig}; use utils::{ id::{NodeId, TenantTimelineId}, @@ -46,6 +45,7 @@ pub fn spawn_connection_manager_task( lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, auth_token: Option>, + ctx: RequestContext, ) { let mut broker_client = get_broker_client().clone(); @@ -78,6 +78,7 @@ pub fn spawn_connection_manager_task( loop_step_result = connection_manager_loop_step( &mut broker_client, &mut walreceiver_state, + &ctx, ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { @@ -101,6 +102,7 @@ pub fn spawn_connection_manager_task( async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, walreceiver_state: &mut WalreceiverState, + ctx: &RequestContext, ) -> ControlFlow<(), ()> { let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); @@ -145,7 +147,7 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! 
guard"); match wal_connection_update { - TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {}, + TaskEvent::Update(TaskStateUpdate::Started) => {}, TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => { if new_status.has_processed_wal { // We have advanced last_record_lsn by processing the WAL received @@ -183,13 +185,23 @@ async fn connection_manager_loop_step( new_event = async { loop { + if walreceiver_state.timeline.current_state() == TimelineState::Loading { + warn!("wal connection manager should only be launched after timeline has become active"); + } match timeline_state_updates.changed().await { Ok(()) => { let new_state = walreceiver_state.timeline.current_state(); match new_state { // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state), + TimelineState::Broken | TimelineState::Stopping => { + info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); + return ControlFlow::Break(()); + } + TimelineState::Loading => { + warn!("timeline transitioned back to Loading state, that should not happen"); + return ControlFlow::Continue(new_state); + } } } Err(_sender_dropped_error) => return ControlFlow::Break(()), @@ -197,7 +209,7 @@ async fn connection_manager_loop_step( } } => match new_event { ControlFlow::Continue(new_state) => { - info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"); + info!("observed timeline state change, new state is {new_state:?}"); return ControlFlow::Continue(()); } ControlFlow::Break(()) => { @@ -226,6 +238,7 @@ async fn connection_manager_loop_step( .change_connection( new_candidate.safekeeper_id, new_candidate.wal_source_connconf, + ctx, ) .await } @@ -289,7 +302,9 @@ async fn subscribe_for_timeline_updates( return resp.into_inner(); } Err(e) => { - warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); + // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and + // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error. 
+ info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); continue; } } @@ -389,12 +404,17 @@ impl WalreceiverState { &mut self, new_sk_id: NodeId, new_wal_source_connconf: PgConnectionConfig, + ctx: &RequestContext, ) { self.drop_old_connection(true).await; let id = self.id; let connect_timeout = self.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); + let ctx = ctx.detached_child( + TaskKind::WalReceiverConnectionHandler, + ctx.download_behavior(), + ); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { super::walreceiver_connection::handle_walreceiver_connection( @@ -403,6 +423,7 @@ impl WalreceiverState { events_sender, cancellation, connect_timeout, + ctx, ) .await .context("walreceiver connection handling failure") @@ -1233,18 +1254,18 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx) + .expect("Failed to create an empty timeline for dummy wal connection manager"); + let timeline = timeline.initialize(&ctx).unwrap(); + WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - timeline: harness - .load() - .await - .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) - .expect("Failed to create an empty timeline for dummy wal connection manager") - .initialize() - .unwrap(), + timeline, wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs similarity index 94% rename from pageserver/src/walreceiver/walreceiver_connection.rs rename to pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 1b9e4923fb..7e06c398af 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -22,7 +22,9 @@ use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; -use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; +use super::TaskStateUpdate; +use crate::context::RequestContext; +use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ task_mgr, task_mgr::TaskKind, @@ -62,6 +64,7 @@ pub async fn handle_walreceiver_connection( events_sender: watch::Sender>, cancellation: CancellationToken, connect_timeout: Duration, + ctx: RequestContext, ) -> anyhow::Result<()> { // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); @@ -77,9 +80,13 @@ pub async fn handle_walreceiver_connection( info!("DB connection stream finished: {expected_error}"); return Ok(()); } - Err(elapsed) => anyhow::bail!( - "Timed out while waiting {elapsed} for walreceiver connection to open" - ), + Err(_) => { + // Timing out to connect to a safekeeper node could happen long time, due to + // many reasons that pageserver cannot control. + // Do not produce an error, but make it visible, that timeouts happen by logging the `event. 
+ info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open"); + return Ok(()); + } } }; @@ -99,10 +106,14 @@ pub async fn handle_walreceiver_connection( // The connection object performs the actual communication with the database, // so spawn it off to run on its own. + let _connection_ctx = ctx.detached_child( + TaskKind::WalReceiverConnectionPoller, + ctx.download_behavior(), + ); let connection_cancellation = cancellation.clone(); task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnection, + TaskKind::WalReceiverConnectionPoller, Some(timeline.tenant_id), Some(timeline.timeline_id), "walreceiver connection", @@ -117,7 +128,7 @@ pub async fn handle_walreceiver_connection( } } }, - + // Future: replace connection_cancellation with connection_ctx cancellation _ = connection_cancellation.cancelled() => info!("Connection cancelled"), } Ok(()) @@ -180,7 +191,7 @@ pub async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; while let Some(replication_message) = { select! { @@ -251,7 +262,7 @@ pub async fn handle_walreceiver_connection( ensure!(lsn.is_aligned()); walingest - .ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; @@ -329,7 +340,7 @@ pub async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. let (timeline_logical_size, _) = timeline - .get_current_logical_size() + .get_current_logical_size(&ctx) .context("Status update creation failed to get current logical size")?; let status_update = ReplicationFeedback { current_timeline_size: timeline_logical_size, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0de2e6654d..3761c65668 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,6 +29,7 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use crate::context::RequestContext; use crate::pgdatadir_mapping::*; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; @@ -52,10 +53,14 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result { + pub async fn new( + timeline: &'a Timeline, + startpoint: Lsn, + ctx: &'_ RequestContext, + ) -> anyhow::Result> { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
- let checkpoint_bytes = timeline.get_checkpoint(startpoint).await?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); @@ -80,6 +85,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, + ctx: &RequestContext, ) -> anyhow::Result<()> { modification.lsn = lsn; decode_wal_record(recdata, decoded, self.timeline.pg_version)?; @@ -97,7 +103,7 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, modification, decoded) + self.ingest_heapam_record(&mut buf, modification, decoded, ctx) .await?; } // Handle other special record types @@ -106,13 +112,14 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create).await?; + self.ingest_xlog_smgr_create(modification, &create, ctx) + .await?; } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate) + self.ingest_xlog_smgr_truncate(modification, &truncate, ctx) .await?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( @@ -126,7 +133,7 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - self.ingest_xlog_dbase_create(modification, &createdb) + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v14::bindings::XLOG_DBASE_DROP @@ -134,7 +141,9 @@ impl<'a> WalIngest<'a> { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id).await?; + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; } } } else if self.timeline.pg_version == 15 { @@ -150,7 +159,7 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. 
debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb) + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP @@ -158,7 +167,9 @@ impl<'a> WalIngest<'a> { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id).await?; + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; } } } @@ -176,12 +187,13 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec) + self.ingest_clog_truncate_record(modification, &xlrec, ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { @@ -193,6 +205,7 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, + ctx, ) .await?; } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED @@ -204,6 +217,7 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, + ctx, ) .await?; // Remove twophase file. see RemoveTwoPhaseFile() in postgres code @@ -213,10 +227,12 @@ impl<'a> WalIngest<'a> { parsed_xact.xid, lsn, ); - modification.drop_twophase_file(parsed_xact.xid).await?; + modification + .drop_twophase_file(parsed_xact.xid, ctx) + .await?; } else if info == pg_constants::XLOG_XACT_PREPARE { modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..])) + .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { @@ -232,6 +248,7 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { @@ -244,6 +261,7 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { @@ -251,12 +269,12 @@ impl<'a> WalIngest<'a> { self.ingest_multixact_create_record(modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec) + self.ingest_multixact_truncate_record(modification, &xlrec, ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded) + self.ingest_relmap_page(modification, &xlrec, decoded, ctx) .await?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -292,7 +310,7 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(modification, lsn, decoded, blk) + self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) .await?; } @@ -317,6 +335,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { let rel = RelTag { spcnode: blk.rnode_spcnode, @@ -359,14 +378,14 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze()) + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) .await?; } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec) + self.put_rel_wal_record(modification, rel, blk.blkno, rec, ctx) .await?; } Ok(()) @@ -377,6 +396,7 @@ impl<'a> WalIngest<'a> { buf: &mut Bytes, modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -456,7 +476,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn).await?; + let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -481,6 +501,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } else { @@ -496,6 +517,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno: None, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } @@ -509,6 +531,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } @@ -524,6 +547,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlCreateDatabase, + ctx: &RequestContext, ) -> anyhow::Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; @@ -539,7 +563,7 @@ impl<'a> WalIngest<'a> { let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn) + .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); @@ -547,10 +571,10 @@ impl<'a> WalIngest<'a> { // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx) .await?; modification - .put_relmap_file(tablespace_id, db_id, filemap) + .put_relmap_file(tablespace_id, db_id, filemap, ctx) .await?; let mut num_rels_copied = 0; @@ -561,7 +585,7 @@ impl<'a> WalIngest<'a> { let nblocks = modification .tline - .get_rel_size(src_rel, req_lsn, true) + .get_rel_size(src_rel, req_lsn, true, ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -570,7 +594,7 @@ impl<'a> WalIngest<'a> { forknum: src_rel.forknum, }; - modification.put_rel_creation(dst_rel, nblocks).await?; + modification.put_rel_creation(dst_rel, nblocks, ctx).await?; // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); @@ -579,7 +603,7 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + 
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -599,6 +623,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlSmgrCreate, + ctx: &RequestContext, ) -> anyhow::Result<()> { let rel = RelTag { spcnode: rec.rnode.spcnode, @@ -606,7 +631,7 @@ impl<'a> WalIngest<'a> { relnode: rec.rnode.relnode, forknum: rec.forknum, }; - self.put_rel_creation(modification, rel).await?; + self.put_rel_creation(modification, rel, ctx).await?; Ok(()) } @@ -617,6 +642,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlSmgrTruncate, + ctx: &RequestContext, ) -> anyhow::Result<()> { let spcnode = rec.rnode.spcnode; let dbnode = rec.rnode.dbnode; @@ -629,7 +655,7 @@ impl<'a> WalIngest<'a> { relnode, forknum: MAIN_FORKNUM, }; - self.put_rel_truncation(modification, rel, rec.blkno) + self.put_rel_truncation(modification, rel, rec.blkno, ctx) .await?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 { @@ -648,10 +674,10 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn).await?; + let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position - self.put_rel_truncation(modification, rel, fsm_physical_page_no) + self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) .await?; } } @@ -670,10 +696,10 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn).await?; + let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position - self.put_rel_truncation(modification, rel, vm_page_no) + self.put_rel_truncation(modification, rel, vm_page_no, ctx) .await?; } } @@ -687,6 +713,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; @@ -745,10 +772,10 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); if modification .tline - .get_rel_exists(rel, last_lsn, true) + .get_rel_exists(rel, last_lsn, true, ctx) .await? { - self.put_rel_drop(modification, rel).await?; + self.put_rel_drop(modification, rel, ctx).await?; } } } @@ -759,6 +786,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlClogTruncate, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", @@ -799,16 +827,15 @@ impl<'a> WalIngest<'a> { // it. So we use the previous record's LSN in the get calls // instead. let req_lsn = modification.tline.get_last_record_lsn(); - - let slru_segments = modification + for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn) - .await?; - for segno in slru_segments { + .list_slru_segments(SlruKind::Clog, req_lsn, ctx) + .await? 
+ { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { modification - .drop_slru_segment(SlruKind::Clog, segno) + .drop_slru_segment(SlruKind::Clog, segno, ctx) .await?; trace!("Drop CLOG segment {:>04X}", segno); } @@ -900,6 +927,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlMultiXactTruncate, + ctx: &RequestContext, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db; @@ -915,7 +943,7 @@ impl<'a> WalIngest<'a> { // contain, possibly partially, valid data. while segment != endsegment { modification - .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32) + .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) .await?; /* move to next segment, handling wraparound correctly */ @@ -937,6 +965,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, + ctx: &RequestContext, ) -> Result<()> { let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -944,18 +973,22 @@ impl<'a> WalIngest<'a> { buf.advance(12); modification - .put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..])) - .await?; - - Ok(()) + .put_relmap_file( + xlrec.tsid, + xlrec.dbid, + Bytes::copy_from_slice(&buf[..]), + ctx, + ) + .await } async fn put_rel_creation( &mut self, modification: &mut DatadirModification<'_>, rel: RelTag, + ctx: &RequestContext, ) -> Result<()> { - modification.put_rel_creation(rel, 0).await?; + modification.put_rel_creation(rel, 0, ctx).await?; Ok(()) } @@ -965,8 +998,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - self.handle_rel_extend(modification, rel, blknum).await?; + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + self.handle_rel_extend(modification, rel, blknum, ctx) + .await?; modification.put_rel_page_image(rel, blknum, img)?; Ok(()) } @@ -977,8 +1012,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { - self.handle_rel_extend(modification, rel, blknum).await?; + ctx: &RequestContext, + ) -> Result<()> { + self.handle_rel_extend(modification, rel, blknum, ctx) + .await?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -988,8 +1025,9 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { - modification.put_rel_truncation(rel, nblocks).await?; + modification.put_rel_truncation(rel, nblocks, ctx).await?; Ok(()) } @@ -997,17 +1035,22 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rel: RelTag, + ctx: &RequestContext, ) -> Result<()> { - modification.put_rel_drop(rel).await?; + modification.put_rel_drop(rel, ctx).await?; Ok(()) } - async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { - let exists = self.timeline.get_rel_exists(rel, lsn, true).await?; - let nblocks = if !exists { + async fn get_relsize( + &mut self, + rel: RelTag, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? { 0 } else { - self.timeline.get_rel_size(rel, lsn, true).await? + self.timeline.get_rel_size(rel, lsn, true, ctx).await? 
}; Ok(nblocks) } @@ -1017,23 +1060,28 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, - ) -> anyhow::Result<()> { + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true).await? { + let old_nblocks = if !self + .timeline + .get_rel_exists(rel, last_lsn, true, ctx) + .await? + { // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0).await?; + modification.put_rel_creation(rel, 0, ctx).await?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn, true).await? + self.timeline.get_rel_size(rel, last_lsn, true, ctx).await? }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); - modification.put_rel_extend(rel, new_nblocks).await?; + modification.put_rel_extend(rel, new_nblocks, ctx).await?; // fill the gap with zeros for gap_blknum in old_nblocks..blknum { @@ -1050,8 +1098,9 @@ impl<'a> WalIngest<'a> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - self.handle_slru_extend(modification, kind, segno, blknum) + ctx: &RequestContext, + ) -> Result<()> { + self.handle_slru_extend(modification, kind, segno, blknum, ctx) .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; Ok(()) @@ -1063,6 +1112,7 @@ impl<'a> WalIngest<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens @@ -1075,17 +1125,17 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); let old_nblocks = if !self .timeline - .get_slru_segment_exists(kind, segno, last_lsn) + .get_slru_segment_exists(kind, segno, last_lsn, ctx) .await? { // create it with 0 size initially, the logic below will extend it modification - .put_slru_segment_creation(kind, segno, 0) + .put_slru_segment_creation(kind, segno, 0, ctx) .await?; 0 } else { self.timeline - .get_slru_segment_size(kind, segno, last_lsn) + .get_slru_segment_size(kind, segno, last_lsn, ctx) .await? 
}; @@ -1134,41 +1184,44 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - async fn init_walingest_test(tline: &Timeline) -> Result { + async fn init_walingest_test<'a>( + tline: &'a Timeline, + ctx: &RequestContext, + ) -> Result> { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; - m.put_relmap_file(0, 111, Bytes::from("")).await?; // dummy relmapper file + m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file m.commit()?; - let walingest = WalIngest::new(tline, Lsn(0x10)).await?; + let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; Ok(walingest) } #[tokio::test] async fn test_relsize() -> Result<()> { - let tenant = TenantHarness::create("test_relsize")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); - walingest.put_rel_creation(&mut m, TESTREL_A).await?; + walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?; walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) + .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) .await?; m.commit()?; @@ -1176,120 +1229,157 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false) + .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) .await .is_err()); - assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, + 1 + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, + 3 + ); // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx) .await?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); // Truncate last block let mut m = tline.begin_modification(Lsn(0x60)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 2).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx) + .await?; m.commit()?; assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 2); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false) + .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .await?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, + 3 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); // Truncate to zero length let mut m = tline.begin_modification(Lsn(0x68)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 0).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx) + .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false).await?, 0); + assert_eq!( + 
tline + .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) + .await?, + 0 + ); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false).await?, 2); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false) + .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx) + .await?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx) .await?, TEST_IMG("foo blk 1") ); @@ -1297,21 +1387,26 @@ mod tests { // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) + .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, 1501); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, + 1501 + ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx) .await?, TEST_IMG("foo blk 1500") ); @@ -1323,31 +1418,40 @@ mod tests { // and then created it again within the same layer. 
#[tokio::test] async fn test_drop_extend() -> Result<()> { - let tenant = TenantHarness::create("test_drop_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; m.commit()?; // Check that rel exists and size is correct assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, + 1 + ); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_drop(&mut m, TESTREL_A).await?; + walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?; m.commit()?; // Check that rel is not visible anymore assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x30), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx) + .await?, false ); @@ -1357,16 +1461,23 @@ mod tests { // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) .await?; m.commit()?; // Check that rel exists and size is correct assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x40), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx) + .await?, + 1 + ); Ok(()) } @@ -1376,9 +1487,9 @@ mod tests { // and then extended it again within the same layer. #[tokio::test] async fn test_truncate_extend() -> Result<()> { - let tenant = TenantHarness::create("test_truncate_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; @@ -1386,27 +1497,33 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false) + .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) .await .is_err()); assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, relsize ); @@ -1416,7 +1533,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false) + .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx) .await?, TEST_IMG(&data) ); @@ -1425,18 +1542,25 @@ mod tests { // Truncate relation so that second segment was dropped // - only leave one page let mut m = tline.begin_modification(Lsn(0x60)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 1).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx) + .await?; m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .await?, + 1 + ); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1444,7 +1568,9 @@ mod tests { // should still see all blocks with older LSN assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, relsize ); for blkno in 0..relsize { @@ -1452,7 +1578,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1465,17 +1591,21 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } m.commit()?; assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x80), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, true ); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, relsize ); // Check relation content @@ -1484,7 +1614,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1497,9 +1627,9 @@ mod tests { /// split into multiple 1 GB segments in Postgres. 
#[tokio::test] async fn test_large_rel() -> Result<()> { - let tenant = TenantHarness::create("test_large_rel")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut lsn = 0x10; for blknum in 0..RELSEG_SIZE + 1 { @@ -1507,7 +1637,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest - .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) + .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; m.commit()?; } @@ -1515,7 +1645,7 @@ mod tests { assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE + 1 ); @@ -1523,11 +1653,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE) + .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -1536,11 +1666,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1) + .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -1552,11 +1682,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber) + .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, size as BlockNumber ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index fd0524016f..c943bf0a27 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,16 +22,18 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; use serde::Serialize; +use std::collections::VecDeque; use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; -use std::sync::Mutex; +use std::sync::{Mutex, MutexGuard}; use std::time::Duration; use std::time::Instant; use std::{fs, io}; @@ -90,6 +92,20 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } +struct ProcessInput { + child: NoLeakChild, + stdin: ChildStdin, + stderr_fd: RawFd, + stdout_fd: RawFd, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + /// /// This is the 
real implementation that uses a Postgres process to /// perform WAL replay. Only one thread can use the process at a time, @@ -101,7 +117,9 @@ pub struct PostgresRedoManager { tenant_id: TenantId, conf: &'static PageServerConf, - process: Mutex>, + stdout: Mutex>, + stdin: Mutex>, + stderr: Mutex>, } /// Can this request be served by neon redo functions @@ -209,16 +227,17 @@ impl PostgresRedoManager { PostgresRedoManager { tenant_id, conf, - process: Mutex::new(None), + stdin: Mutex::new(None), + stdout: Mutex::new(None), + stderr: Mutex::new(None), } } /// Launch process pre-emptively. Should not be needed except for benchmarking. - pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> { - let inner = self.process.get_mut().unwrap(); - if inner.is_none() { - let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; - *inner = Some(p); + pub fn launch_process(&self, pg_version: u32) -> anyhow::Result<()> { + let mut proc = self.stdin.lock().unwrap(); + if proc.is_none() { + self.launch(&mut proc, pg_version)?; } Ok(()) } @@ -241,22 +260,19 @@ impl PostgresRedoManager { let start_time = Instant::now(); - let mut process_guard = self.process.lock().unwrap(); + let mut proc = self.stdin.lock().unwrap(); let lock_time = Instant::now(); // launch the WAL redo process on first use - if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; - *process_guard = Some(p); + if proc.is_none() { + self.launch(&mut proc, pg_version)?; } - let process = process_guard.as_mut().unwrap(); - WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); // Relational WAL records are applied using wal-redo-postgres let buf_tag = BufferTag { rel, blknum }; - let result = process - .apply_wal_records(buf_tag, base_img, records, wal_redo_timeout) + let result = self + .apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout) .map_err(WalRedoError::IoError); let end_time = Instant::now(); @@ -295,8 +311,22 @@ impl PostgresRedoManager { base_img_lsn, lsn ); - let process = process_guard.take().unwrap(); - process.kill(); + // self.stdin only holds stdin & stderr as_raw_fd(). + // Dropping it as part of take() doesn't close them. + // The owning objects (ChildStdout and ChildStderr) are stored in + // self.stdout and self.stderr, respsectively. + // We intentionally keep them open here to avoid a race between + // currently running `apply_wal_records()` and a `launch()` call + // after we return here. + // The currently running `apply_wal_records()` must not read from + // the newly launched process. + // By keeping self.stdout and self.stderr open here, `launch()` will + // get other file descriptors for the new child's stdout and stderr, + // and hence the current `apply_wal_records()` calls will observe + // `output.stdout.as_raw_fd() != stdout_fd` . + if let Some(proc) = self.stdin.lock().unwrap().take() { + proc.child.kill_and_wait(); + } } result } @@ -595,32 +625,23 @@ impl CloseFileDescriptors for C { } } -/// -/// Handle to the Postgres WAL redo process -/// -struct PostgresRedoProcess { - tenant_id: TenantId, - child: NoLeakChild, - stdin: ChildStdin, - stdout: ChildStdout, - stderr: ChildStderr, -} - -impl PostgresRedoProcess { +impl PostgresRedoManager { // // Start postgres binary in special WAL redo mode. 
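// ----------------------------------------------------------------------
// [Editor's aside, illustrative sketch -- not part of this patch.]
// The bookkeeping introduced above hinges on one invariant: ProcessInput
// stores only raw fds, while the owning ChildStdout/ChildStderr handles
// live behind `self.stdout`/`self.stderr`. Because the old handles are
// dropped only after launch() has created the new child, a relaunched
// child always gets different fd numbers, and a stale caller can detect
// the restart by comparing raw fds. A minimal standalone illustration of
// that idea follows; `MiniInput`, `MiniOutput` and `spawn_child` are
// made-up names, and `cat` merely stands in for the wal-redo binary.
use std::os::unix::io::{AsRawFd, RawFd};
use std::process::{Child, ChildStdout, Command, Stdio};

struct MiniInput { stdout_fd: RawFd }     // raw fd only, like ProcessInput
struct MiniOutput { stdout: ChildStdout } // owning handle, like ProcessOutput

fn spawn_child() -> std::io::Result<(Child, MiniInput, MiniOutput)> {
    let mut child = Command::new("cat")
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .spawn()?;
    let stdout = child.stdout.take().expect("stdout was requested as piped");
    let input = MiniInput { stdout_fd: stdout.as_raw_fd() };
    Ok((child, input, MiniOutput { stdout }))
}

// A caller that captured `stdout_fd` before the child was killed and
// relaunched sees a mismatch here, because the fd number of the still-open
// old pipe cannot be handed out again for the new child's stdout.
fn talking_to_same_child(output: &MiniOutput, captured_fd: RawFd) -> bool {
    output.stdout.as_raw_fd() == captured_fd
}
// ----------------------------------------------------------------------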
// - #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + #[instrument(skip_all,fields(tenant_id=%self.tenant_id, pg_version=pg_version))] fn launch( - conf: &PageServerConf, - tenant_id: TenantId, + &self, + input: &mut MutexGuard>, pg_version: u32, - ) -> Result { + ) -> Result<(), Error> { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. let datadir = path_with_suffix_extension( - conf.tenant_path(&tenant_id).join("wal-redo-datadir"), + self.conf + .tenant_path(&self.tenant_id) + .join("wal-redo-datadir"), TEMP_FILE_SUFFIX, ); @@ -634,10 +655,12 @@ impl PostgresRedoProcess { ) })?; } - let pg_bin_dir_path = conf + let pg_bin_dir_path = self + .conf .pg_bin_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_bin_dir path: {e}")))?; - let pg_lib_dir_path = conf + let pg_lib_dir_path = self + .conf .pg_lib_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?; @@ -723,27 +746,31 @@ impl PostgresRedoProcess { // all fallible operations post-spawn are complete, so get rid of the guard let child = scopeguard::ScopeGuard::into_inner(child); - Ok(PostgresRedoProcess { - tenant_id, + **input = Some(ProcessInput { child, + stdout_fd: stdout.as_raw_fd(), + stderr_fd: stderr.as_raw_fd(), stdin, + n_requests: 0, + }); + + *self.stdout.lock().unwrap() = Some(ProcessOutput { stdout, - stderr, - }) + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }); + *self.stderr.lock().unwrap() = Some(stderr); + + Ok(()) } - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] - fn kill(self) { - self.child.kill_and_wait(); - } - - // // Apply given WAL records ('records') over an old page image. Returns // new page image. // - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))] fn apply_wal_records( - &mut self, + &self, + mut input: MutexGuard>, tag: BufferTag, base_img: Option, records: &[(Lsn, NeonWalRecord)], @@ -780,33 +807,23 @@ impl PostgresRedoProcess { build_get_page_msg(tag, &mut writebuf); WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - // The input is now in 'writebuf'. Do a blind write first, writing as much as - // we can, before calling poll(). That skips one call to poll() if the stdin is - // already available for writing, which it almost certainly is because the - // process is idle. - let mut nwrite = self.stdin.write(&writebuf)?; - - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. 
- let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + let proc = input.as_mut().unwrap(); + let mut nwrite = 0usize; + let stdout_fd = proc.stdout_fd; // Prepare for calling poll() let mut pollfds = [ - PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN), - PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN), - PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT), + PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT), + PollFd::new(proc.stderr_fd, PollFlags::POLLIN), + PollFd::new(stdout_fd, PollFlags::POLLIN), ]; - // We do three things simultaneously: send the old base image and WAL records to - // the child process's stdin, read the result from child's stdout, and forward any logging + // We do two things simultaneously: send the old base image and WAL records to + // the child process's stdin and forward any logging // information that the child writes to its stderr to the page server's log. - while nresult < BLCKSZ.into() { - // If we have more data to write, wake up if 'stdin' becomes writeable or - // we have data to read. Otherwise only wake up if there's data to read. - let nfds = if nwrite < writebuf.len() { 3 } else { 2 }; + while nwrite < writebuf.len() { let n = loop { - match nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32) { + match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) { Err(e) if e == nix::errno::Errno::EINTR => continue, res => break res, } @@ -820,14 +837,16 @@ impl PostgresRedoProcess { let err_revents = pollfds[1].revents().unwrap(); if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { let mut errbuf: [u8; 16384] = [0; 16384]; - let n = self.stderr.read(&mut errbuf)?; + let mut stderr_guard = self.stderr.lock().unwrap(); + let stderr = stderr_guard.as_mut().unwrap(); + let len = stderr.read(&mut errbuf)?; // The message might not be split correctly into lines here. But this is // good enough, the important thing is to get the message to the log. - if n > 0 { + if len > 0 { error!( "wal-redo-postgres: {}", - String::from_utf8_lossy(&errbuf[0..n]) + String::from_utf8_lossy(&errbuf[0..len]) ); // To make sure we capture all log from the process if it fails, keep @@ -841,33 +860,157 @@ impl PostgresRedoProcess { )); } - // If we have more data to write and 'stdin' is writeable, do write. - if nwrite < writebuf.len() { - let in_revents = pollfds[2].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += self.stdin.write(&writebuf[nwrite..])?; - } else if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - return Err(Error::new( - ErrorKind::BrokenPipe, - "WAL redo process closed its stdin unexpectedly", - )); - } - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += self.stdout.read(&mut resultbuf[nresult..])?; - } else if out_revents.contains(PollFlags::POLLHUP) { + // If 'stdin' is writeable, do write. + let in_revents = pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } else if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. 
return Err(Error::new( ErrorKind::BrokenPipe, - "WAL redo process closed its stdout unexpectedly", + "WAL redo process closed its stdin unexpectedly", )); } } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(input); - Ok(Bytes::from(resultbuf)) + // To improve walredo performance we separate sending requests and receiving + // responses. They are protected by different mutexes (output and input). + // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process, + // there is no guarantee that T1 will be the first to acquire the output mutex lock. + // To address this issue we maintain the number of sent requests, the number of + // processed responses and a ring buffer of pending responses. After sending a request + // (under the input mutex), each thread remembers its request number. Then it releases + // the input mutex, locks the output mutex and reads responses into the ring buffer until + // its own request number is covered. Then it takes the corresponding element from + // the pending responses ring buffer and truncates all empty elements from the front, + // advancing the processed responses counter. + + let mut output_guard = self.stdout.lock().unwrap(); + let output = output_guard.as_mut().unwrap(); + if output.stdout.as_raw_fd() != stdout_fd { + // If the stdout file descriptor has changed, the walredo process has crashed and been restarted. + // Since ProcessInput and ProcessOutput are protected by different mutexes, + // it can happen that we send a request to one process and wait for a response from another. + // To prevent such a situation we compare stdout file descriptors. + // Since the old stdout pipe is destroyed only after the new one is created, + // it cannot reuse the same file descriptor, so this check is safe. + // + // Cross-read this with the comment in apply_batch_postgres if result.is_err(). + // That's where we kill the child process. + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stdout unexpectedly", + )); + } + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + // We do two things simultaneously: read the response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. + let n = loop { + match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) { + Err(e) if e == nix::errno::Errno::EINTR => continue, + res => break res, + } + }?; + + if n == 0 { + return Err(Error::new(ErrorKind::Other, "WAL redo timed out")); + } + + // If we have some messages in stderr, forward them to the log. + let err_revents = pollfds[1].revents().unwrap(); + if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + let mut errbuf: [u8; 16384] = [0; 16384]; + let mut stderr_guard = self.stderr.lock().unwrap(); + let stderr = stderr_guard.as_mut().unwrap(); + let len = stderr.read(&mut errbuf)?; + + // The message might not be split correctly into lines here. But this is + // good enough, the important thing is to get the message to the log.
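// ----------------------------------------------------------------------
// [Editor's aside, illustrative sketch -- not part of this patch.]
// The request/response pipelining described in the comments above boils
// down to: a counter of sent requests behind the input mutex, plus a
// VecDeque ring buffer of pending responses and a processed-responses
// counter behind the output mutex. Below is a compact standalone model of
// just that bookkeeping; `Pipeline`, `SendSide`, `RecvSide` and
// `read_next` are made-up names, and the real code interleaves the
// receive side with the poll()/read() handling shown in this function.
use std::collections::VecDeque;
use std::sync::Mutex;

struct SendSide { n_requests: usize }
struct RecvSide<T> { pending: VecDeque<Option<T>>, n_processed: usize }

struct Pipeline<T> {
    input: Mutex<SendSide>,
    output: Mutex<RecvSide<T>>,
}

impl<T> Pipeline<T> {
    // Sender side: hand out a monotonically increasing request number
    // (the real code also writes the request to the child's stdin here).
    fn send(&self) -> usize {
        let mut input = self.input.lock().unwrap();
        let request_no = input.n_requests;
        input.n_requests += 1;
        request_no
    }

    // Receiver side: read responses (possibly belonging to other callers)
    // until ours is buffered, take it, then trim leading `None`s so the
    // window of pending responses advances.
    fn receive(&self, request_no: usize, mut read_next: impl FnMut() -> T) -> T {
        let mut out = self.output.lock().unwrap();
        let n_processed = out.n_processed;
        while n_processed + out.pending.len() <= request_no {
            let response = read_next(); // stands in for the stdout read loop
            out.pending.push_back(Some(response));
        }
        let res = out.pending[request_no - n_processed]
            .take()
            .expect("each request number is taken exactly once");
        while let Some(None) = out.pending.front() {
            out.pending.pop_front();
            out.n_processed += 1;
        }
        res
    }
}
// ----------------------------------------------------------------------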
+ if len > 0 { + error!( + "wal-redo-postgres: {}", + String::from_utf8_lossy(&errbuf[0..len]) + ); + + // To make sure we capture all log from the process if it fails, keep + // reading from the stderr, before checking the stdout. + continue; + } + } else if err_revents.contains(PollFlags::POLLHUP) { + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stderr unexpectedly", + )); + } + + // If we have some data in stdout, read it to the result buffer. + let out_revents = pollfds[2].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } else if out_revents.contains(PollFlags::POLLHUP) { + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stdout unexpectedly", + )); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any sequence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't simply pop_front() other requests' responses, because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_responses + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T1: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) } } diff --git a/poetry.lock b/poetry.lock index edbcddd576..fc37124184 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,3 +1,21 @@ +[[package]] +name = "aiohttp" +version = "3.7.0" +description = "Async http client/server framework (asyncio)" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +async-timeout = ">=3.0,<4.0" +attrs = ">=17.3.0" +chardet = ">=2.0,<4.0" +multidict = ">=4.5,<7.0" +yarl = ">=1.0,<2.0" + +[package.extras] +speedups = ["aiodns", "brotlipy", "cchardet"] + [[package]] name = "aiopg" version = "1.3.4" @@ -41,11 +59,11 @@ six = ">=1.9.0" [[package]] name = "async-timeout" -version = "4.0.2" +version = "3.0.1" description = "Timeout context manager for asyncio programs" category = 
"main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.5.3" [[package]] name = "asyncpg" @@ -560,6 +578,14 @@ networkx = ">=2.4,<3.0" pyyaml = ">5.4" sarif-om = ">=1.0.4,<1.1.0" +[[package]] +name = "chardet" +version = "3.0.4" +description = "Universal encoding detector for Python 2 and 3" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "charset-normalizer" version = "2.1.0" @@ -939,6 +965,14 @@ server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)" ssm = ["PyYAML (>=5.1)", "dataclasses"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "multidict" +version = "6.0.4" +description = "multidict implementation" +category = "main" +optional = false +python-versions = ">=3.7" + [[package]] name = "mypy" version = "0.991" @@ -1580,6 +1614,18 @@ category = "main" optional = false python-versions = ">=3.4" +[[package]] +name = "yarl" +version = "1.8.2" +description = "Yet another URL library" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + [[package]] name = "zipp" version = "3.8.1" @@ -1595,9 +1641,44 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "af44b269c235a6fd59dacb4ff9e05cbc13a79b57254a8d5d4bde934bd5691a70" +content-hash = "0f7289ef9439d1d7cd36b07efb53741b773669b0f860189c800270b7def0c241" [metadata.files] +aiohttp = [ + {file = "aiohttp-3.7.0-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:72fe89f7e14939e896d984c4b592580f8cdfa7497feb1c0c24639a9c60be3eb9"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:fdf778d4c4bf976e69a37213fe8083613d0851976ddcf485bd7c0650a43d3852"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:fee7b5e68939ffc09f9b29f167ed49c8b50de3eee0a1d8108b439ddd9963af46"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:dd64634713be409202058f2ea267dfbcdd74b387b8793425f21ef0266d45d0e9"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:713dd7fd70ddda9dc8d014c49dd0e55b58afe4e0cddb8722c7501f53edf30c3f"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_s390x.whl", hash = "sha256:d31c43f7c4948ce01957f9a1ceee0784e067778477557ebccdf805398331c1a1"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:5e26d6003eb6df304608d9fd9c9437065a8532d869a3ffcbd8113a3d710f8239"}, + {file = "aiohttp-3.7.0-cp36-cp36m-win_amd64.whl", hash = "sha256:bf08462cddd10ddd8ffe5cb5c1638bfa051290909ebedb31c06e46578b9b7529"}, + {file = "aiohttp-3.7.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:07bacf6721db51a4c6160ed3031a2a97910647969dafd7c653f600f3b542f463"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:245b58e30bc889d18b783db2f09ef1d814f466e15c84325410827451297003a0"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b392e5c3e122586c49cd8b9426f577bf4d51958933b839d158d28b69515af74e"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:5b5c320621a171aa85f96909af28fbb5286bd6842066db3062b083ba92261256"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:97d2341d1360dbe2c5b1d94922f7d68f9ce2ded1daab88b9bdeb49ce419cdc1b"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:beda23f292716887532661dc19abb9db2302ccfbd671a080cd8f4be7463d0841"}, + 
{file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:cbcaae9a6f14f762348d19b2dce8162772c0b0a1739314e18492a308a22caf96"}, + {file = "aiohttp-3.7.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7a49ef7b691babc83db126db874fbf26ba2f781899b91399f9ff8b235f059245"}, + {file = "aiohttp-3.7.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f56892f57310415cf6a179eec3ea6c7a82a9d37fbc00894943ea3154011a6d2a"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:df1274b7620c32d3b15bfb0a8fb3165dd6cdc9c39f4db74d162f051c80826542"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:a04ba359dc5f2e21b96bfc90c4a7665441441ba61b52e992b7799493889a3419"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:f548d7976d168f0f45ac5909ca5f606ae3f6f7aa1725b22504004a053b29a7d0"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:deef02e2a9f5095463098c7c22d5566f20a6e4e14fc0996c0c2efc74d461b680"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:fe44c96bc380588d36729392b602470d88a7c18e646e95dd4348cafe3900d91d"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:9210532e6e95b40d22a33415bb84423eef3f633b2d2339b97f3b26438eebc466"}, + {file = "aiohttp-3.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:a586e476a251483d222c73dfb2f27df90bc4ea1b8c7da9396236510e0d4046c8"}, + {file = "aiohttp-3.7.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:900012c5f12ff72b1453229afe288ddc9135176df8b3b3cc5b8f6cfde912aaa4"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:064d5f0738bcbab3e0c0ecf85c93b5ee1e07e124f994eaa03bf73687f3ecd9da"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:0a2edf27865e66a33f64fa793cd14d0aae8127ce20a858539e97c25b600556dc"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:eaa8ae734639d5a0a3b5e33a154b8bfef384cdc090706f95c387cae8b21af764"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:a8a42f05491d9c04a77806875a68f84fea9af7a59d47b7897cb166632f74606c"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:b19ded3f6957693b97ba8372aacb5b0021639bbd5e77b1e960796bcef5431969"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:cefbd7ce7d1f1db43749a077e4970e29e2b631f367c9eff3862c3c886b4218dd"}, + {file = "aiohttp-3.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:7d64f7dfd4e326d9b0d11b07fcd5ebf78844ba3c8f7699f38b50b0e0db0ae68f"}, + {file = "aiohttp-3.7.0.tar.gz", hash = "sha256:176f1d2b2bc07044f4ed583216578a72a2bd35dffdeb92e0517d0aaa29d29549"}, +] aiopg = [ {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"}, {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, @@ -1611,8 +1692,8 @@ allure-python-commons = [ {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, ] async-timeout = [ - {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, - {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, + {file = "async-timeout-3.0.1.tar.gz", hash = "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f"}, + {file = 
"async_timeout-3.0.1-py3-none-any.whl", hash = "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3"}, ] asyncpg = [ {file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"}, @@ -1787,6 +1868,10 @@ cfn-lint = [ {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, ] +chardet = [ + {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"}, + {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"}, +] charset-normalizer = [ {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, @@ -1960,6 +2045,82 @@ moto = [ {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] +multidict = [ + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, + {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = 
"sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, + {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, + {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, + {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, + {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, + {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, + {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, + {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, + {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, + 
{file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, + {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, + {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, + {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, +] mypy = [ {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"}, {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"}, @@ -2412,6 +2573,82 @@ xmltodict = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, ] +yarl = [ + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"}, + {file = 
"yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"}, + {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"}, + {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"}, + {file = 
"yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"}, + {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"}, + {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"}, + {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"}, + {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"}, + {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"}, + {file = 
"yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"}, + {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"}, + {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"}, + {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"}, + {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"}, + {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"}, +] zipp = [ {file = 
"zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 5d44774df9..1b61ab108f 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -30,7 +30,7 @@ use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use tracing::{info, info_span, Instrument}; use utils::project_git_version; -use utils::sentry_init::{init_sentry, release_name}; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -49,7 +49,7 @@ async fn main() -> anyhow::Result<()> { .init(); // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[]); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let arg_matches = cli().get_matches(); diff --git a/pyproject.toml b/pyproject.toml index b4fb7a9e7d..a817e9dda5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.4" types-toml = "^0.10.8" pytest-httpserver = "^1.0.6" +aiohttp = "3.7" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index b130ea86bd..1a068412c8 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -38,7 +38,7 @@ use utils::{ id::NodeId, logging::{self, LogFormat}, project_git_version, - sentry_init::{init_sentry, release_name}, + sentry_init::init_sentry, signals, tcp_listener, }; @@ -173,7 +173,10 @@ fn main() -> anyhow::Result<()> { }; // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]); + let _sentry_guard = init_sentry( + Some(GIT_VERSION.into()), + &[("node_id", &conf.my_id.to_string())], + ); start_safekeeper(conf) } diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py new file mode 100644 index 0000000000..5472d86d8f --- /dev/null +++ b/scripts/force_layer_download.py @@ -0,0 +1,324 @@ +import argparse +import asyncio +import json +import logging +import signal +import sys +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Awaitable, Dict, List, Tuple + +import aiohttp + + +class ClientException(Exception): + pass + + +class Client: + def __init__(self, pageserver_api_endpoint: str, max_concurrent_layer_downloads: int): + self.endpoint = pageserver_api_endpoint + self.max_concurrent_layer_downloads = max_concurrent_layer_downloads + self.sess = aiohttp.ClientSession() + + async def close(self): + await self.sess.close() + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_t, exc_v, exc_tb): + await self.close() + + async def parse_response(self, resp, expected_type): + body = await resp.json() + if not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + if not isinstance(body, expected_type): + raise ClientException(f"expecting {expected_type.__name__}") + return body + + async def get_tenant_ids(self): + resp = await self.sess.get(f"{self.endpoint}/v1/tenant") + payload = await self.parse_response(resp=resp, expected_type=list) + return [t["id"] for t in payload] + + async def get_timeline_ids(self, tenant_id): + resp = await self.sess.get(f"{self.endpoint}/v1/tenant/{tenant_id}/timeline") + payload = await self.parse_response(resp=resp, 
expected_type=list) + return [t["timeline_id"] for t in payload] + + async def timeline_spawn_download_remote_layers(self, tenant_id, timeline_id, ongoing_ok=False): + resp = await self.sess.post( + f"{self.endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + json={"max_concurrent_downloads": self.max_concurrent_layer_downloads}, + ) + body = await resp.json() + if resp.status == 409: + if not ongoing_ok: + raise ClientException("download already ongoing") + # response body has same shape for ongoing and newly created + elif not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + if not isinstance(body, dict): + raise ClientException("expecting dict") + + return body + + async def timeline_poll_download_remote_layers_status( + self, + tenant_id, + timeline_id, + ): + resp = await self.sess.get( + f"{self.endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + body = await resp.json() + + if resp.status == 404: + return None + elif not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + return body + + +@dataclass +class Completed: + """The status dict returned by the API""" + + status: Dict[str, Any] + + +sigint_received = asyncio.Event() + + +async def do_timeline(client: Client, tenant_id, timeline_id): + """ + Spawn download_remote_layers task for given timeline, + then poll until the download has reached a terminal state. + + If the terminal state is not 'Completed', the method raises an exception. + The caller is responsible for inspecting `failed_download_count`. + + If there is already a task going on when this method is invoked, + it raises an exception. + """ + + # Don't start new downloads if user pressed SIGINT. + # This task will show up as "raised_exception" in the report. + if sigint_received.is_set(): + raise Exception("not starting because SIGINT received") + + # run downloads to completion + + status = await client.timeline_poll_download_remote_layers_status(tenant_id, timeline_id) + if status is not None and status["state"] == "Running": + raise Exception("download is already running") + + spawned = await client.timeline_spawn_download_remote_layers( + tenant_id, timeline_id, ongoing_ok=False + ) + + while True: + st = await client.timeline_poll_download_remote_layers_status(tenant_id, timeline_id) + logging.info(f"{tenant_id}:{timeline_id} state is: {st}") + + if spawned["task_id"] != st["task_id"]: + raise ClientException("download task ids changed while polling") + + if st["state"] == "Running": + await asyncio.sleep(10) + continue + + if st["state"] != "Completed": + raise ClientException( + f"download task reached terminal state != Completed: {st['state']}" + ) + + return Completed(st) + + +def handle_sigint(): + logging.info("SIGINT received, asyncio event set. 
Will not start new downloads.") + global sigint_received + sigint_received.set() + + +async def main(args): + async with Client(args.pageserver_http_endpoint, args.max_concurrent_layer_downloads) as client: + exit_code = await main_impl(args, args.report_output, client) + + return exit_code + + +async def taskq_handler(task_q, result_q): + while True: + try: + (id, fut) = task_q.get_nowait() + except asyncio.QueueEmpty: + logging.debug("taskq_handler observed empty task_q, returning") + return + logging.info(f"starting task {id}") + try: + res = await fut + except Exception as e: + res = e + result_q.put_nowait((id, res)) + + +async def print_progress(result_q, tasks): + while True: + await asyncio.sleep(10) + logging.info(f"{result_q.qsize()} / {len(tasks)} tasks done") + + +async def main_impl(args, report_out, client: Client): + """ + Returns OS exit status. + """ + tenant_and_timline_ids: List[Tuple[str, str]] = [] + # fill tenant_and_timline_ids based on spec + for spec in args.what: + comps = spec.split(":") + if comps == ["ALL"]: + logging.info("get tenant list") + tenant_ids = await client.get_tenant_ids() + get_timeline_id_coros = [client.get_timeline_ids(tenant_id) for tenant_id in tenant_ids] + gathered = await asyncio.gather(*get_timeline_id_coros, return_exceptions=True) + assert len(tenant_ids) == len(gathered) + tenant_and_timline_ids = [] + for tid, tlids in zip(tenant_ids, gathered): + for tlid in tlids: + tenant_and_timline_ids.append((tid, tlid)) + elif len(comps) == 1: + tid = comps[0] + tlids = await client.get_timeline_ids(tid) + for tlid in tlids: + tenant_and_timline_ids.append((tid, tlid)) + elif len(comps) == 2: + tenant_and_timline_ids.append((comps[0], comps[1])) + else: + raise ValueError(f"invalid what-spec: {spec}") + + logging.info("expanded spec:") + for tid, tlid in tenant_and_timline_ids: + logging.info(f"{tid}:{tlid}") + + logging.info("remove duplicates after expanding spec") + tmp = list(set(tenant_and_timline_ids)) + assert len(tmp) <= len(tenant_and_timline_ids) + if len(tmp) != len(tenant_and_timline_ids): + logging.info(f"spec had {len(tenant_and_timline_ids) - len(tmp)} duplicates") + tenant_and_timline_ids = tmp + + logging.info("create tasks and process them at specified concurrency") + task_q: asyncio.Queue[Tuple[str, Awaitable[Any]]] = asyncio.Queue() + tasks = { + f"{tid}:{tlid}": do_timeline(client, tid, tlid) for tid, tlid in tenant_and_timline_ids + } + for task in tasks.items(): + task_q.put_nowait(task) + + result_q: asyncio.Queue[Tuple[str, Any]] = asyncio.Queue() + taskq_handlers = [] + for _ in range(0, args.concurrent_tasks): + taskq_handlers.append(taskq_handler(task_q, result_q)) + + print_progress_task = asyncio.create_task(print_progress(result_q, tasks)) + + await asyncio.gather(*taskq_handlers) + print_progress_task.cancel() + + logging.info("all tasks handled, generating report") + + results = [] + while True: + try: + results.append(result_q.get_nowait()) + except asyncio.QueueEmpty: + break + assert task_q.empty() + + report = defaultdict(list) + for id, result in results: + logging.info(f"result for {id}: {result}") + if isinstance(result, Completed): + if result.status["failed_download_count"] == 0: + report["completed_without_errors"].append(id) + else: + report["completed_with_download_errors"].append(id) + elif isinstance(result, Exception): + report["raised_exception"].append(id) + else: + raise ValueError("unexpected result type") + json.dump(report, report_out) + + 
logging.info("--------------------------------------------------------------------------------") + + report_success = len(report["completed_without_errors"]) == len(tenant_and_timline_ids) + if not report_success: + logging.error("One or more tasks encountered errors.") + else: + logging.info("All tasks reported success.") + logging.info("Inspect log for details and report file for JSON summary.") + + return report_success + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--report-output", + type=argparse.FileType("w"), + default="-", + help="where to write report output (default: stdout)", + ) + parser.add_argument( + "--pageserver-http-endpoint", + default="http://localhost:9898", + help="pageserver http endpoint, (default http://localhost:9898)", + ) + parser.add_argument( + "--concurrent-tasks", + required=False, + default=5, + type=int, + help="Max concurrent download tasks created & polled by this script", + ) + parser.add_argument( + "--max-concurrent-layer-downloads", + dest="max_concurrent_layer_downloads", + required=False, + default=8, + type=int, + help="Max concurrent download tasks spawned by pageserver. Each layer is a separate task.", + ) + + parser.add_argument( + "what", + nargs="+", + help="what to download: ALL|tenant_id|tenant_id:timeline_id", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="enable verbose logging", + ) + args = parser.parse_args() + + level = logging.INFO + if args.verbose: + level = logging.DEBUG + logging.basicConfig( + format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", + level=level, + ) + + loop = asyncio.get_event_loop() + + loop.add_signal_handler(signal.SIGINT, handle_sigint) + sys.exit(asyncio.run(main(args))) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 6d80e96bf1..e33369bbb1 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -45,7 +45,7 @@ use storage_broker::{ use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::project_git_version; -use utils::sentry_init::{init_sentry, release_name}; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -425,7 +425,7 @@ async fn http1_handler( #[tokio::main] async fn main() -> Result<(), Box> { // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[]); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let args = Args::parse(); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 8b78e06c22..bdaaa95216 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -46,6 +46,12 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( "pageserver_remote_physical_size", ) +PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( + "pageserver_storage_operations_seconds_global_count", + "pageserver_storage_operations_seconds_global_sum", + "pageserver_storage_operations_seconds_global_bucket", +) + PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_current_logical_size", "pageserver_resident_physical_size", @@ -61,13 +67,13 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", - "pageserver_storage_operations_seconds_bucket", - "pageserver_storage_operations_seconds_count", - "pageserver_storage_operations_seconds_sum", + "pageserver_storage_operations_seconds_count_total", + "pageserver_storage_operations_seconds_sum_total", "pageserver_wait_lsn_seconds_bucket", "pageserver_wait_lsn_seconds_count", "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", + "pageserver_tenant_states_count", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, ) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 29cdcb18ce..cbbf01a285 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -2,7 +2,15 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import ( + LocalFsStorage, + NeonEnvBuilder, + RemoteStorageKind, + assert_tenant_status, + wait_for_upload, +) +from fixtures.types import Lsn +from fixtures.utils import wait_until def test_tenant_config(neon_env_builder: NeonEnvBuilder): @@ -57,7 +65,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "compaction_period": 20, "compaction_threshold": 10, "gc_horizon": 67108864, - "gc_period": 100, + "gc_period": 60 * 60, "image_creation_threshold": 3, "pitr_interval": 604800, # 7 days }.items() @@ -158,3 +166,46 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "pitr_interval": 60, }.items() ) + + +def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.LOCAL_FS, + test_name="test_creating_tenant_conf_after_attach", + ) + + env = neon_env_builder.init_start() + assert isinstance(env.remote_storage, LocalFsStorage) + + # tenant is created with defaults, as in without config file + (tenant_id, timeline_id) = env.neon_cli.create_tenant() + config_path = env.repo_dir / "tenants" / str(tenant_id) / "config" + assert config_path.exists(), "config file is always initially created" + + http_client = env.pageserver.http_client() + + detail = http_client.timeline_detail(tenant_id, timeline_id) + last_record_lsn = Lsn(detail["last_record_lsn"]) + assert last_record_lsn.lsn_int != 0, "initdb must have executed" + + wait_for_upload(http_client, tenant_id, timeline_id, last_record_lsn) + + http_client.tenant_detach(tenant_id) + + assert not config_path.exists(), "detach did not remove config file" + + http_client.tenant_attach(tenant_id) + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_tenant_status(http_client, tenant_id, "Active"), + ) + + env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "1000000"}) + contents_first = config_path.read_text() + env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "0"}) + contents_later = config_path.read_text() + + # dont test applying the setting here, we have that another test case to show it + # we just care about being able to create the file + assert len(contents_first) > len(contents_later) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index db5bb679f2..6c3454b79b 100644 --- a/test_runner/regress/test_tenant_detach.py +++ 
b/test_runner/regress/test_tenant_detach.py @@ -6,6 +6,7 @@ from threading import Thread import asyncpg import pytest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, @@ -59,11 +60,11 @@ def test_tenant_reattach( # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start("main", tenant_id=tenant_id) - with pg.cursor() as cur: - cur.execute("CREATE TABLE t(key int primary key, value text)") - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # Wait for the all data to be processed by the pageserver and uploaded in remote storage wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -78,15 +79,34 @@ def test_tenant_reattach( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) + ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + tenant_metric_filter = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + } + pageserver_last_record_lsn_before_detach = int( + ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value + ) + pageserver_http.tenant_detach(tenant_id) pageserver_http.tenant_attach(tenant_id) - with pg.cursor() as cur: - assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + time.sleep(1) # for metrics propagation - # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") - assert env.pageserver.log_contains(".*download.*failed, will retry.*") + ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + pageserver_last_record_lsn = int( + ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value + ) + + assert pageserver_last_record_lsn_before_detach == pageserver_last_record_lsn + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + + # Check that we had to retry the downloads + assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") + assert env.pageserver.log_contains(".*download.*failed, will retry.*") num_connections = 10 @@ -237,7 +257,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found") + env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found") # first check for non existing tenant tenant_id = TenantId.generate() @@ -272,8 +292,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) - # the error will be printed to the log too + # the error will be printed to the log too env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") + # Timelines get stopped during detach, ignore the gc calls that error, whitnessing that + 
env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*") # Detach while running manual GC. # It should wait for manual GC to finish because it runs in a task associated with the tenant. diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 9477ae3c25..e56bb1b469 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,5 +1,6 @@ import os import shutil +import time from contextlib import closing from datetime import datetime from pathlib import Path @@ -8,6 +9,7 @@ from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import ( + PAGESERVER_GLOBAL_METRICS, PAGESERVER_PER_TENANT_METRICS, PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, parse_metrics, @@ -160,6 +162,14 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" ) + # Test (a subset of) pageserver global metrics + for metric in PAGESERVER_GLOBAL_METRICS: + ps_samples = ps_metrics.query_all(metric, {}) + assert len(ps_samples) > 0 + for sample in ps_samples: + labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()]) + log.info(f"{sample.name}{{{labels}}} {sample.value}") + @pytest.mark.parametrize( "remote_storage_kind", @@ -259,7 +269,7 @@ def test_pageserver_with_empty_tenants( files_in_timelines_dir == 0 ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" - # Trigger timeline reinitialization after pageserver restart + # Trigger timeline re-initialization after pageserver restart env.postgres.stop_all() env.pageserver.stop() @@ -278,7 +288,51 @@ def test_pageserver_with_empty_tenants( broken_tenant["state"] == "Broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + broken_tenant_status = client.tenant_status(tenant_without_timelines_dir) + assert ( + broken_tenant_status["state"] == "Broken" + ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + + assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*") + [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] assert ( loaded_tenant["state"] == "Active" ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation" + + loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir) + assert ( + loaded_tenant_status["state"] == "Active" + ), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active" + + time.sleep(1) # to allow metrics propagation + + ps_metrics = parse_metrics(client.get_metrics(), "pageserver") + broken_tenants_metric_filter = { + "tenant_id": str(tenant_without_timelines_dir), + "state": "broken", + } + active_tenants_metric_filter = { + "tenant_id": str(tenant_with_empty_timelines_dir), + "state": "active", + } + + tenant_active_count = int( + ps_metrics.query_one( + "pageserver_tenant_states_count", filter=active_tenants_metric_filter + ).value + ) + + assert ( + tenant_active_count == 1 + ), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active" + + tenant_broken_count = int( + ps_metrics.query_one( + "pageserver_tenant_states_count", filter=broken_tenants_metric_filter + ).value + ) + + assert ( + tenant_broken_count == 1 + ), f"Tenant {tenant_without_timelines_dir} should have 
metric as broken" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f4b71ae9b7..3a852b2207 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -20,7 +20,9 @@ clap = { version = "4", features = ["derive", "string"] } crossbeam-utils = { version = "0.8" } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } +futures-executor = { version = "0.3" } futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } indexmap = { version = "1", default-features = false, features = ["std"] } @@ -31,17 +33,21 @@ memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128", "libm"] } +num-traits = { version = "0.2", features = ["i128"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-syntax = { version = "0.6" } +reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +ring = { version = "0.16", features = ["std"] } +rustls = { version = "0.20", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] } tokio-util = { version = "0.7", features = ["codec", "io"] } +tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } From a41b5244a84116c16ace9143a02f8d21d218a84c Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 20 Feb 2023 18:22:49 +0300 Subject: [PATCH 36/63] Add new safekeeper to ap-southeast-1 prod (#3645) (#3646) To trigger deployment of #3645 to production. 
--- .github/ansible/prod.ap-southeast-1.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 7c6d1db6d7..71fced23c2 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -36,3 +36,5 @@ storage: ansible_host: i-0e338adda8eb2d19f safekeeper-2.ap-southeast-1.aws.neon.tech: ansible_host: i-04fb63634e4679eb9 + safekeeper-3.ap-southeast-1.aws.neon.tech: + ansible_host: i-05481f3bc88cfc2d4 From 78aca668d08190934708f864fbddd14da316e8a8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 19:31:53 +0200 Subject: [PATCH 37/63] fix: log download failed error (#3661) Fixes #3659 --- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/timeline.rs | 1 + test_runner/regress/test_tenants.py | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e9ce52d1ab..20d1d2bfb6 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -74,7 +74,7 @@ async fn compaction_loop(tenant_id: TenantId) { let period = tenant.get_compaction_period(); // TODO: we shouldn't need to await to find tenant and this could be moved outside of - // loop + // loop, #3501. There are also additional "allowed_errors" in tests. if first { first = false; if random_init_delay(period, &cancel).await.is_err() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f2b0a98509..8bc02cd10a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3745,6 +3745,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. + info!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index e56bb1b469..9e75396799 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -250,6 +250,10 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*could not load tenant.*Failed to list timelines directory.*" ) + # this is until #3501 + env.pageserver.allowed_errors.append( + ".*Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant" + ) client = env.pageserver.http_client() From 15273a9b669c75b8a5bcda186715e7c1a940bfd6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 20:20:13 +0200 Subject: [PATCH 38/63] chore: ignore all compaction inactive tenant errors (#3665) these are happening in tests because of #3655 but they sure took some time to appear. makes the `Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant` into a globally allowed error, because it has been seen failing on different test cases. 
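
For illustration, a quick standalone check (not part of the change itself, and assuming nothing about how the `allowed_errors` patterns are applied internally) that the broadened pattern still matches the message the removed per-test entry used to allow:

```python
import re

# globally allowed pattern added by this change ("[^:]+" instead of the hard-coded "2s" retry delay)
pattern = r".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant"

# the log line that previously needed a per-test allowed_errors entry
line = "Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant"

assert re.search(pattern, line) is not None
```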
--- test_runner/fixtures/neon_fixtures.py | 2 ++ test_runner/regress/test_tenants.py | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 63196609cc..73f224039e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2080,6 +2080,8 @@ class NeonPageserver(PgProtocol): ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", + # this is until #3501 + ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant", ] def start( diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 9e75396799..e56bb1b469 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -250,10 +250,6 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*could not load tenant.*Failed to list timelines directory.*" ) - # this is until #3501 - env.pageserver.allowed_errors.append( - ".*Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant" - ) client = env.pageserver.http_client() From 43bf6d0a0f2695074f360bbc3b1deea2065f6321 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 21:09:31 +0200 Subject: [PATCH 39/63] calculate_logical_size: no longer use spawn_blocking (#3664) Calculation of logical size is now async because of layer downloads, so we shouldn't use spawn_blocking for it. Use of `spawn_blocking` exhausted resources which are needed by `tokio::io::copy` when copying from a stream to a file which lead to deadlock. Fixes: #3657 --- pageserver/src/tenant/timeline.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8bc02cd10a..176eb61ff3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1770,15 +1770,9 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); - tokio::task::spawn_blocking(move || { - // Run in a separate thread since this can do a lot of - // synchronous file IO without .await inbetween - // if there are no RemoteLayers that would require downloading. - let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) - }) - .await - .context("Failed to spawn calculation result task")? + self_calculation + .calculate_logical_size(init_lsn, cancel, &ctx) + .await }; let timeline_state_cancellation = async { loop { @@ -1811,7 +1805,7 @@ impl Timeline { tokio::pin!(calculation); loop { tokio::select! { - res = &mut calculation => { return res } + res = &mut calculation => { return res } reason = timeline_state_cancellation => { debug!(reason = reason, "cancelling calculation"); cancel.cancel(); From a51b269f1502b69c5b2720d0a0e20b1e702ebd58 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 21:14:08 +0200 Subject: [PATCH 40/63] fix: hold permit until GetObject eof (#3663) previously we applied the ratelimiting only up to receiving the headers from s3, or somewhere near it. 
the commit adds an adapter which carries the permit until the AsyncRead has been disposed.

fixes #3662.
---
 Cargo.lock                           |  1 +
 libs/remote_storage/Cargo.toml       |  2 +-
 libs/remote_storage/src/s3_bucket.rs | 45 +++++++++++++++++++++++-----
 3 files changed, 40 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d154b4eaea..dab3d12263 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3054,6 +3054,7 @@ dependencies = [
  "hyper",
  "metrics",
  "once_cell",
+ "pin-project-lite",
  "serde",
  "serde_json",
  "tempfile",
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 4382fbac32..15812e8439 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -21,7 +21,7 @@ toml_edit.workspace = true
 tracing.workspace = true
 metrics.workspace = true
 utils.workspace = true
-
+pin-project-lite.workspace = true
 workspace_hack.workspace = true
 
 [dev-dependencies]
diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs
index 18a2c5dedd..93f5e0596e 100644
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -20,7 +20,10 @@ use aws_sdk_s3::{
 };
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
-use tokio::{io, sync::Semaphore};
+use tokio::{
+    io::{self, AsyncRead},
+    sync::Semaphore,
+};
 use tokio_util::io::ReaderStream;
 use tracing::debug;
 
@@ -102,7 +105,7 @@ pub struct S3Bucket {
     // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
     // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
     // This helps to ensure we don't exceed the thresholds.
-    concurrency_limiter: Semaphore,
+    concurrency_limiter: Arc<Semaphore>,
 }
 
 #[derive(Default)]
@@ -162,7 +165,7 @@ impl S3Bucket {
             client,
             bucket_name: aws_config.bucket_name.clone(),
             prefix_in_bucket,
-            concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()),
+            concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
         })
     }
 
@@ -194,9 +197,10 @@ impl S3Bucket {
     }
 
     async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
-        let _guard = self
+        let permit = self
             .concurrency_limiter
-            .acquire()
+            .clone()
+            .acquire_owned()
             .await
             .context("Concurrency limiter semaphore got closed during S3 download")
             .map_err(DownloadError::Other)?;
@@ -217,9 +221,10 @@ impl S3Bucket {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);
                 Ok(Download {
                     metadata,
-                    download_stream: Box::pin(io::BufReader::new(
+                    download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new(
+                        permit,
                         object_output.body.into_async_read(),
-                    )),
+                    ))),
                 })
             }
             Err(SdkError::ServiceError {
@@ -240,6 +245,32 @@ impl S3Bucket {
     }
 }
 
+pin_project_lite::pin_project! {
+    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
+    struct RatelimitedAsyncRead<S> {
+        permit: tokio::sync::OwnedSemaphorePermit,
+        #[pin]
+        inner: S,
+    }
+}
+
+impl<S: AsyncRead> RatelimitedAsyncRead<S> {
+    fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
+        RatelimitedAsyncRead { permit, inner }
+    }
+}
+
+impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
+    fn poll_read(
+        self: std::pin::Pin<&mut Self>,
+        cx: &mut std::task::Context<'_>,
+        buf: &mut io::ReadBuf<'_>,
+    ) -> std::task::Poll<std::io::Result<()>> {
+        let this = self.project();
+        this.inner.poll_read(cx, buf)
+    }
+}
+
 #[async_trait::async_trait]
 impl RemoteStorage for S3Bucket {
     async fn list(&self) -> anyhow::Result<Vec<RemotePath>> {

From 38cd90dd0c85ff6286e689cae33a95fe095ffca0 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov
Date: Tue, 21 Feb 2023 21:11:52 +0100
Subject: [PATCH 41/63] Add -v to ansible invocations (#3670)

To get more debug output on failures
---
 .github/workflows/deploy-dev.yml  | 2 +-
 .github/workflows/deploy-prod.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml
index 409517bf63..b080a29f7c 100644
--- a/.github/workflows/deploy-dev.yml
+++ b/.github/workflows/deploy-dev.yml
@@ -67,7 +67,7 @@ jobs:
           ./get_binaries.sh
 
           ansible-galaxy collection install sivel.toiletwater
-          ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
+          ansible-playbook -v deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
           rm -f neon_install.tar.gz .neon_current_version
 
       - name: Cleanup ansible folder
diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml
index 540d187274..6096ac8ab9 100644
--- a/.github/workflows/deploy-prod.yml
+++ b/.github/workflows/deploy-prod.yml
@@ -68,7 +68,7 @@ jobs:
           ./get_binaries.sh
 
           ansible-galaxy collection install sivel.toiletwater
-          ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
+          ansible-playbook -v deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
           rm -f neon_install.tar.gz .neon_current_version
 
 deploy-proxy-prod-new:

From 46cc8b7982b270796d6266861801b2e466319968 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky
Date: Wed, 22 Feb 2023 12:55:41 +0300
Subject: [PATCH 42/63] Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671)

We migrated all timelines to `safekeeper-3.ap-southeast-1.aws.neon.tech`,
now old instance can be removed.
--- .github/ansible/prod.ap-southeast-1.hosts.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 13b44f4052..8ccb67b04a 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -32,8 +32,6 @@ storage: hosts: safekeeper-0.ap-southeast-1.aws.neon.tech: ansible_host: i-0d6f1dc5161eef894 - safekeeper-1.ap-southeast-1.aws.neon.tech: - ansible_host: i-0e338adda8eb2d19f safekeeper-2.ap-southeast-1.aws.neon.tech: ansible_host: i-04fb63634e4679eb9 safekeeper-3.ap-southeast-1.aws.neon.tech: From efef68ce9972ba47ce7890e386d1b066e67f695d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 22 Feb 2023 22:52:22 +0400 Subject: [PATCH 43/63] Bump vendor/postgres to include hotfix for unlogged tables with indexes. https://github.com/neondatabase/postgres/pull/259 https://github.com/neondatabase/postgres/pull/262 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index f210ac524b..b44ee1d9a5 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit f210ac524b42d2d6f404f8505c64de36e977d17c +Subproject commit b44ee1d9a5b061ababb31f89a4e30a1795573f51 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 33f9763454..303fa4050f 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 33f976345490351f951d72f81621c2263c186c9a +Subproject commit 303fa4050fafba3771052b3d49b8e2d00d6ea2e3 From 91a4ea0de2a69b6003c6d224b6c96789ce1a4513 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 23 Feb 2023 15:10:22 +0100 Subject: [PATCH 44/63] Update vendored PostgreSQL versions to 14.7 and 15.2 (#3581) ## Describe your changes Rebase vendored PostgreSQL onto 14.7 and 15.2 ## Issue ticket number and link #3579 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [x] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [x] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ``` The version of PostgreSQL that we use is updated to 14.7 for PostgreSQL 14 and 15.2 for PostgreSQL 15. 
``` --- test_runner/fixtures/neon_fixtures.py | 10 +- test_runner/regress/test_tenant_size.py | 164 ++++++++++++++++-------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 4 files changed, 120 insertions(+), 58 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 73f224039e..c4b3d057f8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1669,7 +1669,7 @@ class AbstractNeonCli(abc.ABC): timeout=timeout, ) if not res.returncode: - log.info(f"Run success: {res.stdout}") + log.info(f"Run {res.args} success: {res.stdout}") elif check_return_code: # this way command output will be in recorded and shown in CI in failure message msg = f"""\ @@ -3463,6 +3463,14 @@ def wait_for_last_flush_lsn( return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) +def wait_for_wal_insert_lsn( + env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId +) -> Lsn: + """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" + last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) + return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) + + def fork_at_current_lsn( env: NeonEnv, pg: Postgres, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 8c2996f491..a4b5f7739a 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -3,8 +3,15 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn -from fixtures.types import Lsn +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, + Postgres, + wait_for_last_flush_lsn, + wait_for_wal_insert_lsn, +) +from fixtures.types import Lsn, TenantId, TimelineId def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): @@ -324,7 +331,7 @@ def test_single_branch_get_tenant_size_grows( # inserts is larger than gc_horizon. for example 0x20000 here hid the fact # that there next_gc_cutoff could be smaller than initdb_lsn, which will # obviously lead to issues when calculating the size. 
- gc_horizon = 0x30000 + gc_horizon = 0x38000 neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" env = neon_env_builder.init_start() @@ -334,29 +341,75 @@ def test_single_branch_get_tenant_size_grows( http_client = env.pageserver.http_client() - collected_responses: List[Tuple[Lsn, int]] = [] + collected_responses: List[Tuple[str, Lsn, int]] = [] size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int): - if current_lsn - initdb_lsn > gc_horizon: + def check_size_change( + current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int + ): + if current_lsn - initdb_lsn >= gc_horizon: assert ( - size >= prev + size >= prev_size ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size" else: assert ( - size > prev + size > prev_size ), "tenant_size should grow, because we continue to add WAL to initial snapshot size" - with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: - initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + def get_current_consistent_size( + env: NeonEnv, + pg: Postgres, + size_debug_file, # apparently there is no public signature for open()... + http_client: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Tuple[Lsn, int]: + consistent = False + size_debug = None + + current_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id) + # We want to make sure we have a self-consistent set of values. + # Size changes with WAL, so only if both before and after getting + # the size of the tenant reports the same WAL insert LSN, we're OK + # to use that (size, LSN) combination. + # Note that 'wait_for_wal_flush_lsn' is not accurate enough: There + # can be more wal after the flush LSN that can arrive on the + # pageserver before we're requesting the page size. + # Anyway, in general this is only one iteration, so in general + # this is fine. + while not consistent: + size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + size_debug = http_client.tenant_size_debug(tenant_id) + + after_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id) + consistent = current_lsn == after_lsn + current_lsn = after_lsn + size_debug_file.write(size_debug) + return (current_lsn, size) + + with env.postgres.create_start( + branch_name, + tenant_id=tenant_id, + ### autovacuum is disabled to limit WAL logging. 
+ config_lines=["autovacuum=off"], + ) as pg: + (initdb_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) + collected_responses.append(("INITDB", initdb_lsn, size)) + with pg.cursor() as cur: - cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)") + cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL) WITH (fillfactor = 40)") + + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) + collected_responses.append(("CREATE", current_lsn, size)) batch_size = 100 - i = 0 - while True: + for i in range(3): with pg.cursor() as cur: cur.execute( f"INSERT INTO t0(i) SELECT i FROM generate_series({batch_size} * %s, ({batch_size} * (%s + 1)) - 1) s(i)", @@ -365,27 +418,24 @@ def test_single_branch_get_tenant_size_grows( i += 1 - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + prev_size = collected_responses[-1][2] + if size == 0: + assert prev_size == 0 + else: + # branch start shouldn't be past gc_horizon yet + # thus the size should grow as we insert more data + # "gc_horizon" is tuned so that it kicks in _after_ the + # insert phase, but before the update phase ends. + assert ( + current_lsn - initdb_lsn <= gc_horizon + ), "Tuning of GC window is likely out-of-date" + assert size > prev_size - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) - - if len(collected_responses) > 0: - prev = collected_responses[-1][1] - if size == 0: - assert prev == 0 - else: - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - assert current_lsn - initdb_lsn <= gc_horizon - assert size > prev - - collected_responses.append((current_lsn, size)) - - if len(collected_responses) > 2: - break + collected_responses.append(("INSERT", current_lsn, size)) while True: with pg.cursor() as cur: @@ -397,18 +447,15 @@ def test_single_branch_get_tenant_size_grows( if updated == 0: break - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + prev_size = collected_responses[-1][2] - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - prev = collected_responses[-1][1] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) - - collected_responses.append((current_lsn, size)) + collected_responses.append(("UPDATE", current_lsn, size)) while True: with pg.cursor() as cur: @@ -418,40 +465,47 @@ def test_single_branch_get_tenant_size_grows( if deleted == 0: break - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append((current_lsn, size)) + 
collected_responses.append(("DELETE", current_lsn, size)) with pg.cursor() as cur: cur.execute("DROP TABLE t0") - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + # The size of the tenant should still be as large as before we dropped + # the table, because the drop operation can still be undone in the PITR + # defined by gc_horizon. + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append((current_lsn, size)) + collected_responses.append(("DROP", current_lsn, size)) # this isn't too many lines to forget for a while. observed while # developing these tests that locally the value is a bit more than what we # get in the ci. - for lsn, size in collected_responses: - log.info(f"collected: {lsn}, {size}") + for phase, lsn, size in collected_responses: + log.info(f"collected: {phase}, {lsn}, {size}") env.pageserver.stop() env.pageserver.start() + size_after = http_client.tenant_size(tenant_id) + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) size_debug_file.close() - size_after = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev = collected_responses[-1][2] assert size_after == prev, "size after restarting pageserver should not have changed" diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index b44ee1d9a5..468d3c0824 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit b44ee1d9a5b061ababb31f89a4e30a1795573f51 +Subproject commit 468d3c08245906f083fed1009759f9f953f5915d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 303fa4050f..9a2093383a 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 303fa4050fafba3771052b3d49b8e2d00d6ea2e3 +Subproject commit 9a2093383ae19906f025b008ceecf89ebc9ea869 From 240913912a4d9450ac3c67810b60f1df7e65afb4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 24 Feb 2023 13:45:32 +0200 Subject: [PATCH 45/63] Fix UNLOGGED tables. Instead of trying to create missing files on the way, send init fork contents as main fork from pageserver during basebackup. Add test for that. Call put_rel_drop for init forks; previously they weren't removed. Bump vendor/postgres to revert previous approach on Postgres side. 
Co-authored-by: Arseny Sher ref https://github.com/neondatabase/postgres/pull/264 ref https://github.com/neondatabase/postgres/pull/259 ref https://github.com/neondatabase/neon/issues/1222 --- libs/pageserver_api/src/reltag.rs | 9 ++++++ pageserver/src/basebackup.rs | 45 ++++++++++++++++++++-------- pageserver/src/walingest.rs | 4 +-- test_runner/regress/test_unlogged.py | 34 +++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 6 files changed, 79 insertions(+), 17 deletions(-) create mode 100644 test_runner/regress/test_unlogged.py diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 43d38bd986..12693379f5 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -98,6 +98,15 @@ impl RelTag { name } + + pub fn with_forknum(&self, forknum: u8) -> Self { + RelTag { + forknum, + spcnode: self.spcnode, + dbnode: self.dbnode, + relnode: self.relnode, + } + } } /// diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 06d4853274..41fa0a67bb 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -33,6 +33,7 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; use postgres_ffi::TransactionId; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; @@ -190,14 +191,31 @@ where { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; - // Gather and send relational files in each database if full backup is requested. - if self.full_backup { - for rel in self - .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) - .await? - { - self.add_rel(rel).await?; + // If full backup is requested, include all relation files. + // Otherwise only include init forks of unlogged relations. + let rels = self + .timeline + .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .await?; + for &rel in rels.iter() { + // Send init fork as main fork to provide well formed empty + // contents of UNLOGGED relations. Postgres copies it in + // `reinit.c` during recovery. + if rel.forknum == INIT_FORKNUM { + // I doubt we need _init fork itself, but having it at least + // serves as a marker relation is unlogged. + self.add_rel(rel, rel).await?; + self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; + continue; + } + + if self.full_backup { + if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM)) + { + // skip this, will include it when we reach the init fork + continue; + } + self.add_rel(rel, rel).await?; } } } @@ -220,15 +238,16 @@ where Ok(()) } - async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { + /// Add contents of relfilenode `src`, naming it as `dst`. 
+ async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(tag, self.lsn, false, self.ctx) + .get_rel_size(src, self.lsn, false, self.ctx) .await?; // If the relation is empty, create an empty file if nblocks == 0 { - let file_name = tag.to_segfile_name(0); + let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; self.ar.append(&header, &mut io::empty()).await?; return Ok(()); @@ -244,12 +263,12 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx) + .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } - let file_name = tag.to_segfile_name(seg as u32); + let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; self.ar.append(&header, segment_data.as_slice()).await?; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3761c65668..63d568a342 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -37,7 +37,7 @@ use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::CheckPoint; @@ -762,7 +762,7 @@ impl<'a> WalIngest<'a> { )?; for xnode in &parsed.xnodes { - for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM { + for forknum in MAIN_FORKNUM..=INIT_FORKNUM { let rel = RelTag { forknum, spcnode: xnode.spcnode, diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py new file mode 100644 index 0000000000..b6b20f1230 --- /dev/null +++ b/test_runner/regress/test_unlogged.py @@ -0,0 +1,34 @@ +from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn + + +# +# Test UNLOGGED tables/relations. Postgres copies init fork contents to main +# fork to reset them during recovery. In Neon, pageserver directly sends init +# fork contents as main fork during basebackup. 
+# +def test_unlogged(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_unlogged", "empty") + pg = env.postgres.create_start("test_unlogged") + + conn = pg.connect() + cur = conn.cursor() + + cur.execute("CREATE UNLOGGED TABLE iut (id int);") + # create index to test unlogged index relation as well + cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);") + cur.execute("INSERT INTO iut values (42);") + + # create another compute to fetch inital empty contents from pageserver + fork_at_current_lsn(env, pg, "test_unlogged_basebackup", "test_unlogged") + pg2 = env.postgres.create_start( + "test_unlogged_basebackup", + ) + + conn2 = pg2.connect() + cur2 = conn2.cursor() + # after restart table should be empty but valid + cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)") + cur2.execute("EXECUTE iut_plan (43);") + cur2.execute("SELECT * FROM iut") + assert cur2.fetchall() == [(43,)] diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 468d3c0824..5fb2e0bba0 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 468d3c08245906f083fed1009759f9f953f5915d +Subproject commit 5fb2e0bba06cc018ee2506f337c91751ab695454 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9a2093383a..919851e781 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9a2093383ae19906f025b008ceecf89ebc9ea869 +Subproject commit 919851e7811fcb2ecfc67f35bfd63a35639c73b5 From 99752286d836653e3af3bc798871394970af8db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Tue, 14 Mar 2023 15:23:46 +0200 Subject: [PATCH 46/63] Use RollingUpdate strategy also for legacy proxy (#3814) ## Describe your changes We have previously changed the neon-proxy to use RollingUpdate. This should be enabled in legacy proxy too in order to avoid breaking connections for the clients and allow for example backups to run even during deployment. (https://github.com/neondatabase/neon/pull/3683) ## Issue ticket number and link https://github.com/neondatabase/neon/issues/3333 --- ...od-us-west-2-eta.neon-proxy-scram-legacy.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index e67a3e4461..d23ea41bd7 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -1,6 +1,22 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 604800"] +terminationGracePeriodSeconds: 604800 + + image: repository: neondatabase/neon From afd0a6b39aa10392a74ab1200fe4765936b75ce0 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Mar 2023 11:44:55 +0400 Subject: [PATCH 47/63] Forward framed read buf contents to compute before proxy pass. Otherwise they get lost. Normally buffer is empty before proxy pass, but this is not the case with pipeline mode of out npm driver; fixes connection hangup introduced by b80fe41af3e for it. 
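The general shape of the fix, as a hedged standalone sketch rather than the proxy code itself (the real change is in `proxy/src/proxy.rs` below): whatever is still sitting in the handshake read buffer must be written to the compute connection before both sockets are handed over to the plain byte pumps. `splice_with_leftover` is a hypothetical helper name used only for this illustration.

```rust
use std::io::{self, BufRead, BufReader, Write};
use std::net::TcpStream;
use std::thread;

fn splice_with_leftover(client: TcpStream, compute: TcpStream) -> io::Result<()> {
    // The handshake is read through a buffered reader; with a pipelining
    // client it may already hold bytes that belong to the compute node.
    let mut client_reader = BufReader::new(client.try_clone()?);
    // ... handshake happens here, possibly leaving pipelined bytes buffered ...

    let mut compute_writer = compute.try_clone()?;
    let leftover = client_reader.buffer().to_vec();
    compute_writer.write_all(&leftover)?; // forward already-buffered bytes first
    client_reader.consume(leftover.len());

    // Only now start the dumb bidirectional copy.
    let mut client_writer = client;
    let mut compute_reader = compute;
    let pump = thread::spawn(move || io::copy(&mut client_reader, &mut compute_writer));
    io::copy(&mut compute_reader, &mut client_writer)?;
    pump.join().expect("pump thread panicked")?;
    Ok(())
}
```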
fixes https://github.com/neondatabase/neon/issues/3822 --- libs/pq_proto/src/framed.rs | 6 +++--- proxy/src/proxy.rs | 27 +++++++++++++++++++++------ proxy/src/stream.rs | 5 +++-- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 972730cbab..3cdca45009 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -63,9 +63,9 @@ impl Framed { &self.stream } - /// Extract the underlying stream. - pub fn into_inner(self) -> S { - self.stream + /// Deconstruct into the underlying stream and read buffer. + pub fn into_inner(self) -> (S, BytesMut) { + (self.stream, self.read_buf) } /// Return new Framed with stream type transformed by async f, for TLS diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index abeff6a33b..efe0e8795b 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -16,7 +16,7 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tracing::{error, info, warn}; use utils::measured_stream::MeasuredStream; @@ -209,9 +209,18 @@ async fn handshake( if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. - stream = PqStream::new( - stream.into_inner().upgrade(tls.to_server_config()).await?, - ); + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empy. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + stream = PqStream::new(raw.upgrade(tls.to_server_config()).await?); } } _ => bail!(ERR_PROTO_VIOLATION), @@ -443,11 +452,17 @@ impl Client<'_, S> { value: mut node_info, } = auth_result; - let node = connect_to_compute(&mut node_info, params, &extra, &creds) + let mut node = connect_to_compute(&mut node_info, params, &extra, &creds) .or_else(|e| stream.throw_error(e)) .await?; prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?; - proxy_pass(stream.into_inner(), node.stream, &node_info.aux).await + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + proxy_pass(stream, node.stream, &node_info.aux).await } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 9dfc435e39..7cb292ed58 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,5 +1,6 @@ use crate::error::UserFacingError; use anyhow::bail; +use bytes::BytesMut; use pin_project_lite::pin_project; use pq_proto::framed::{ConnectionError, Framed}; use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; @@ -27,8 +28,8 @@ impl PqStream { } } - /// Extract the underlying stream. - pub fn into_inner(self) -> S { + /// Extract the underlying stream and read buffer. 
+ pub fn into_inner(self) -> (S, BytesMut) { self.framed.into_inner() } From 4ed51ad33b12edfec2e03b47f8fcfb7e7b8173cb Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 11 Apr 2023 12:50:10 +0300 Subject: [PATCH 48/63] Add more proxy cnames --- .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 2 +- .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml | 2 +- .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 2 +- .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 36dac8309d..5a98217bae 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" - extraDomains: ["*.ap-southeast-1.retooldb.com"] + extraDomains: ["*.ap-southeast-1.retooldb.com", "*.ap-southeast-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index f5b2f31cb9..a9ee49d82f 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" - extraDomains: ["*.eu-central-1.retooldb.com"] + extraDomains: ["*.eu-central-1.retooldb.com", "*.eu-central-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 0be78d868a..239a9911c7 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-2.aws.neon.tech" - extraDomains: ["*.us-east-2.retooldb.com"] + extraDomains: ["*.us-east-2.retooldb.com", "*.us-east-2.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 79115be0e2..c987ae236a 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-west-2.aws.neon.tech" - extraDomains: ["*.us-west-2.retooldb.com"] + extraDomains: ["*.us-west-2.retooldb.com", "*.us-west-2.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" From d11d781afc300b1717050cad38ff02e21aa2dc02 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 28 Apr 2023 17:20:18 +0300 Subject: [PATCH 49/63] revert: "Add check for duplicates of generated image layers" (#4104) This reverts commit 732acc5. Reverted PR: #3869 As noted in PR #4094, we do in fact try to insert duplicates to the layer map, if L0->L1 compaction is interrupted. We do not have a proper fix for that right now, and we are in a hurry to make a release to production, so revert the changes related to this to the state that we have in production currently. We know that we have a bug here, but better to live with the bug that we've had in production for a long time, than rush a fix to production without testing it in staging first. Cc: #4094, #4088 --- pageserver/benches/bench_layer_map.rs | 4 +-- pageserver/src/tenant.rs | 7 +----- pageserver/src/tenant/layer_map.rs | 23 +++++++---------- .../layer_map/historic_layer_coverage.rs | 8 ------ pageserver/src/tenant/timeline.rs | 25 +++++++------------ 5 files changed, 21 insertions(+), 46 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 8f139a6596..ee5980212e 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { min_lsn = min(min_lsn, lsn_range.start); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); - updates.insert_historic(Arc::new(layer)).unwrap(); + updates.insert_historic(Arc::new(layer)); } println!("min: {min_lsn}, max: {max_lsn}"); @@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) { is_incremental: false, short_id: format!("Layer {}", i), }; - updates.insert_historic(Arc::new(layer)).unwrap(); + updates.insert_historic(Arc::new(layer)); } updates.flush(); println!("Finished layer map init in {:?}", now.elapsed()); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d69d5e4b45..5cfc466111 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -271,10 +271,7 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - // Initialize without loading the layer map. We started with an empty layer map, and already - // updated it for the layers that we created during the import. - let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - self.initialize_with_lock(ctx, &mut timelines, false, true) + self.initialize(ctx) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -2355,8 +2352,6 @@ impl Tenant { ) })?; - // Initialize the timeline without loading the layer map, because we already updated the layer - // map above, when we imported the datadir. let timeline = { let mut timelines = self.timelines.lock().unwrap(); raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)? diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 0ee0c6f77d..8d06ccd565 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,7 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; -use anyhow::{bail, Result}; +use anyhow::Result; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; @@ -125,7 +125,7 @@ where /// /// Insert an on-disk layer. 
/// - pub fn insert_historic(&mut self, layer: Arc) -> anyhow::Result<()> { + pub fn insert_historic(&mut self, layer: Arc) { self.layer_map.insert_historic_noflush(layer) } @@ -273,21 +273,16 @@ where /// /// Helper function for BatchedUpdates::insert_historic /// - pub(self) fn insert_historic_noflush(&mut self, layer: Arc) -> anyhow::Result<()> { - let key = historic_layer_coverage::LayerKey::from(&*layer); - if self.historic.contains(&key) { - bail!( - "Attempt to insert duplicate layer {} in layer map", - layer.short_id() - ); - } - self.historic.insert(key, Arc::clone(&layer)); + pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + // TODO: See #3869, resulting #4088, attempted fix and repro #4094 + self.historic.insert( + historic_layer_coverage::LayerKey::from(&*layer), + Arc::clone(&layer), + ); if Self::is_l0(&layer) { self.l0_delta_layers.push(layer); } - - Ok(()) } /// @@ -839,7 +834,7 @@ mod tests { let expected_in_counts = (1, usize::from(expected_l0)); - map.batch_update().insert_historic(remote.clone()).unwrap(); + map.batch_update().insert_historic(remote.clone()); assert_eq!(count_layer_in(&map, &remote), expected_in_counts); let replaced = map diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 1fdcd5e5a4..b63c361314 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -417,14 +417,6 @@ impl BufferedHistoricLayerCoverage { } } - pub fn contains(&self, layer_key: &LayerKey) -> bool { - match self.buffer.get(layer_key) { - Some(None) => false, // layer remove was buffered - Some(_) => true, // layer insert was buffered - None => self.layers.contains_key(layer_key), // no buffered ops for this layer - } - } - pub fn insert(&mut self, layer_key: LayerKey, value: Value) { self.buffer.insert(layer_key, Some(value)); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5c671ffd63..8768841d87 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1484,7 +1484,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer))?; + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1516,7 +1516,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer))?; + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1590,7 +1590,7 @@ impl Timeline { // remote index file? // If so, rename_to_backup those files & replace their local layer with // a RemoteLayer in the layer map so that we re-download them on-demand. 
- if let Some(local_layer) = &local_layer { + if let Some(local_layer) = local_layer { let local_layer_path = local_layer .local_path() .expect("caller must ensure that local_layers only contains local layers"); @@ -1615,6 +1615,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); + updates.remove_historic(local_layer); // fall-through to adding the remote layer } } else { @@ -1650,11 +1651,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - if let Some(local_layer) = &local_layer { - updates.replace_historic(local_layer, remote_layer)?; - } else { - updates.insert_historic(remote_layer)?; - } + updates.insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1678,11 +1675,7 @@ impl Timeline { LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), ); let remote_layer = Arc::new(remote_layer); - if let Some(local_layer) = &local_layer { - updates.replace_historic(local_layer, remote_layer)?; - } else { - updates.insert_historic(remote_layer)?; - } + updates.insert_historic(remote_layer); } } } @@ -2730,7 +2723,7 @@ impl Timeline { .write() .unwrap() .batch_update() - .insert_historic(Arc::new(new_delta))?; + .insert_historic(Arc::new(new_delta)); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2935,7 +2928,7 @@ impl Timeline { self.metrics .resident_physical_size_gauge .add(metadata.len()); - updates.insert_historic(Arc::new(l))?; + updates.insert_historic(Arc::new(l)); } updates.flush(); drop(layers); @@ -3368,7 +3361,7 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); - updates.insert_historic(x)?; + updates.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can From 4c3ba1627b26178917880f23c4c9eede98960626 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Fri, 28 Apr 2023 20:16:02 +0300 Subject: [PATCH 50/63] Add 4 new Pageservers for retool launch --- .github/ansible/prod.us-west-2.hosts.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 9cf847bcb1..1fde83520e 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -41,6 +41,14 @@ storage: ansible_host: i-051642d372c0a4f32 pageserver-3.us-west-2.aws.neon.tech: ansible_host: i-00c3844beb9ad1c6b + pageserver-4.us-west-2.aws.neon.tech: + ansible_host: i-013263dd1c239adcc + pageserver-5.us-west-2.aws.neon.tech: + ansible_host: i-00ca6417c7bf96820 + pageserver-6.us-west-2.aws.neon.tech: + ansible_host: i-01cdf7d2bc1433b6a + pageserver-7.us-west-2.aws.neon.tech: + ansible_host: i-02eec9b40617db5bc safekeepers: hosts: @@ -50,4 +58,3 @@ storage: ansible_host: i-074682f9d3c712e7c safekeeper-2.us-west-2.aws.neon.tech: ansible_host: i-042b7efb1729d7966 - From ff1119da6678d589026d3a74004f2dfe53411abe Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 1 May 2023 13:33:10 +0300 Subject: [PATCH 51/63] Add 2 new sets of safekeepers to us-west2 --- .github/ansible/prod.us-west-2.hosts.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 1fde83520e..be65d8e63c 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ 
b/.github/ansible/prod.us-west-2.hosts.yaml @@ -58,3 +58,15 @@ storage: ansible_host: i-074682f9d3c712e7c safekeeper-2.us-west-2.aws.neon.tech: ansible_host: i-042b7efb1729d7966 + safekeeper-3.us-west-2.aws.neon.tech: + ansible_host: i-089f6b9ef426dff76 + safekeeper-4.us-west-2.aws.neon.tech: + ansible_host: i-0fe6bf912c4710c82 + safekeeper-5.us-west-2.aws.neon.tech: + ansible_host: i-0a83c1c46d2b4e409 + safekeeper-6.us-west-2.aws.neon.tech: + ansible_host: i-0fef5317b8fdc9f8d + safekeeper-7.us-west-2.aws.neon.tech: + ansible_host: i-0be739190d4289bf9 + safekeeper-8.us-west-2.aws.neon.tech: + ansible_host: i-00e851803669e5cfe From 840183e51fde41b6f3019d33655d9974d2f6880e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 11 May 2023 15:08:39 +0300 Subject: [PATCH 52/63] try: higher page_service timeouts to isolate an issue --- pageserver/src/page_service.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a7a0d1a22e..bd3ece2dfc 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -256,7 +256,10 @@ async fn page_service_conn_main( // // no write timeout is used, because the kernel is assumed to error writes after some time. let mut socket = tokio_io_timeout::TimeoutReader::new(socket); - socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10))); + + // timeout should be lower, but trying out multiple days for + // + socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3))); let socket = std::pin::pin!(socket); // XXX: pgbackend.run() should take the connection_ctx, From 85d6194aa40671566f349283f63a692d1861c640 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 16 May 2023 17:19:12 +0100 Subject: [PATCH 53/63] Fix regress-tests job for Postgres 15 on release branch (#4254) ## Problem Compatibility tests don't support Postgres 15 yet, but we're still trying to upload compatibility snapshot (which we do not collect). 
Ref https://github.com/neondatabase/neon/actions/runs/4991394158/jobs/8940369368#step:4:38129 ## Summary of changes Add `pg_version` parameter to `run-python-test-set` actions and do not upload compatibility snapshot for Postgres 15 --- .github/actions/run-python-test-set/action.yml | 11 ++++++++--- .github/workflows/build_and_test.yml | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index d6c960bfda..bb120e9470 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -48,6 +48,10 @@ inputs: description: 'Whether to rerun flaky tests' required: false default: 'false' + pg_version: + description: 'Postgres version to use for tests' + required: false + default: 'v14' runs: using: "composite" @@ -68,7 +72,7 @@ runs: prefix: latest - name: Download compatibility snapshot for Postgres 14 - if: inputs.build_type != 'remote' + if: inputs.build_type != 'remote' && inputs.pg_version == 'v14' uses: ./.github/actions/download with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14 @@ -106,13 +110,14 @@ runs: ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') RERUN_FLAKY: ${{ inputs.rerun_flaky }} + PG_VERSION: ${{ inputs.pg_version }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} - export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14} + export DEFAULT_PG_VERSION=${PG_VERSION#v} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -193,7 +198,7 @@ runs: fi - name: Upload compatibility snapshot for Postgres 14 - if: github.ref_name == 'release' + if: github.ref_name == 'release' && inputs.pg_version == 'v14' uses: ./.github/actions/upload with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5a09f0b4aa..ef9f171766 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -350,8 +350,8 @@ jobs: real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" rerun_flaky: true + pg_version: ${{ matrix.pg_version }} env: - DEFAULT_PG_VERSION: ${{ matrix.pg_version }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty From b0a77844f60cdcfb84648f2a7d40f0bb85c1f189 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 14 Apr 2023 19:41:02 +0300 Subject: [PATCH 54/63] Add SQL-over-HTTP endpoint to Proxy This commit introduces an SQL-over-HTTP endpoint in the proxy, with a JSON response structure resembling that of the node-postgres driver. This method, using HTTP POST, achieves smaller amortized latencies in edge setups due to fewer round trips and an enhanced open connection reuse by the v8 engine. This update involves several intricacies: 1. 
SQL injection protection: We employed the extended query protocol, modifying the rust-postgres driver to send queries in one roundtrip using a text protocol rather than binary, bypassing potential issues like those identified in https://github.com/sfackler/rust-postgres/issues/1030. 2. Postgres type compatibility: As not all postgres types have binary representations (e.g., acl's in pg_class), we adjusted rust-postgres to respond with text protocol, simplifying serialization and fixing queries with text-only types in response. 3. Data type conversion: Considering JSON supports fewer data types than Postgres, we perform conversions where possible, passing all other types as strings. Key conversions include: - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain text) - postgres bool, null, text -> json bool, null, string - postgres array -> json array - postgres json and jsonb -> json object 4. Alignment with node-postgres: To facilitate integration with js libraries, we've matched the response structure of node-postgres, returning command tags and column oids. Command tag capturing was added to the rust-postgres functionality as part of this change. --- Cargo.lock | 10 +- Cargo.toml | 12 +- proxy/README.md | 86 ++- proxy/src/config.rs | 5 +- proxy/src/http.rs | 1 + proxy/src/http/sql_over_http.rs | 603 ++++++++++++++++++ proxy/src/http/websocket.rs | 86 ++- test_runner/fixtures/neon_fixtures.py | 41 +- test_runner/regress/test_metric_collection.py | 2 + test_runner/regress/test_proxy.py | 128 +++- 10 files changed, 909 insertions(+), 65 deletions(-) create mode 100644 proxy/src/http/sql_over_http.rs diff --git a/Cargo.lock b/Cargo.lock index 55418473d5..4d63ebd99d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2820,7 +2820,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "bytes", "fallible-iterator", @@ -2833,7 +2833,7 @@ dependencies = [ [[package]] name = "postgres-native-tls" version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "native-tls", "tokio", @@ -2844,7 +2844,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "base64 0.20.0", "byteorder", @@ -2862,7 +2862,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "bytes", "fallible-iterator", @@ -4321,7 +4321,7 @@ 
dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index c901532f86..7895459841 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -126,11 +126,11 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" } ## Other git libraries @@ -166,7 +166,7 @@ tonic-build = "0.9" # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } # Changes the MAX_THREADS limit from 4096 to 32768. # This is a temporary workaround for using tracing from many threads in safekeepers code, diff --git a/proxy/README.md b/proxy/README.md index 4ead098b73..cd76a2443f 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -1,6 +1,6 @@ # Proxy -Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented: +Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. 
Following routing backends are currently implemented: * console new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) @@ -9,6 +9,90 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a * link sends login link for all usernames +Also proxy can expose following services to the external world: + +* postgres protocol over TCP -- usual postgres endpoint compatible with usual + postgres drivers +* postgres protocol over WebSockets -- same protocol tunneled over websockets + for environments where TCP connection is not available. We have our own + implementation of a client that uses node-postgres and tunnels traffic through + websockets: https://github.com/neondatabase/serverless +* SQL over HTTP -- service that accepts POST requests with SQL text over HTTP + and responds with JSON-serialised results. + + +## SQL over HTTP + +Contrary to the usual postgres proto over TCP and WebSockets using plain +one-shot HTTP request achieves smaller amortized latencies in edge setups due to +fewer round trips and an enhanced open connection reuse by the v8 engine. Also +such endpoint could be used directly without any driver. + +To play with it locally one may start proxy over a local postgres installation +(see end of this page on how to generate certs with openssl): + +``` +./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 +``` + +If both postgres and proxy are running you may send a SQL query: +```json +curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ + -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \ + -H 'Content-Type: application/json' \ + --data '{ + "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", + "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}] + }' | jq + +{ + "command": "SELECT", + "fields": [ + { "dataTypeID": 1007, "name": "arr" }, + { "dataTypeID": 3802, "name": "obj" }, + { "dataTypeID": 23, "name": "num" } + ], + "rowCount": 1, + "rows": [ + { + "arr": [[1,2],[3,4]], + "num": 42, + "obj": { + "ikey": 4242, + "key": "val" + } + } + ] +} +``` + + +With the current approach we made the following design decisions: + +1. SQL injection protection: We employed the extended query protocol, modifying + the rust-postgres driver to send queries in one roundtrip using a text + protocol rather than binary, bypassing potential issues like those identified + in sfackler/rust-postgres#1030. + +2. Postgres type compatibility: As not all postgres types have binary + representations (e.g., acl's in pg_class), we adjusted rust-postgres to + respond with text protocol, simplifying serialization and fixing queries with + text-only types in response. + +3. Data type conversion: Considering JSON supports fewer data types than + Postgres, we perform conversions where possible, passing all other types as + strings. Key conversions include: + - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain + text) + - postgres bool, null, text -> json bool, null, string + - postgres array -> json array + - postgres json and jsonb -> json object + +4. Alignment with node-postgres: To facilitate integration with js libraries, + we've matched the response structure of node-postgres, returning command tags + and column oids. Command tag capturing was added to the rust-postgres + functionality as part of this change. 
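+The endpoint can be called from any HTTP client, not only curl. As a rough
+illustration (an assumption-laden sketch, not shipped code), the request from
+the example above could be issued from Rust with the `reqwest` crate
+(`blocking` and `json` features) and `serde_json`; certificate verification is
+disabled only because the local setup above uses a self-signed certificate,
+mirroring `curl -k`:
+
+```rust
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // Accept the self-signed certificate generated for local testing.
+    let client = reqwest::blocking::ClientBuilder::new()
+        .danger_accept_invalid_certs(true)
+        .build()?;
+
+    let body = serde_json::json!({
+        "query": "SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num",
+        "params": ["{{1,2},{\"3\",4}}", {"key": "val", "ikey": 4242}]
+    });
+
+    let resp: serde_json::Value = client
+        .post("https://proxy.localtest.me:4444/sql")
+        .header(
+            "Neon-Connection-String",
+            "postgres://stas:pass@proxy.localtest.me:4444/postgres",
+        )
+        .json(&body)
+        .send()?
+        .json()?;
+
+    println!("{}", serde_json::to_string_pretty(&resp)?);
+    Ok(())
+}
+```
+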
+ ## Using SNI-based routing on localhost Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy: diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 530229b3fd..6a26cea78e 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -100,9 +100,10 @@ impl CertResolver { is_default: bool, ) -> anyhow::Result<()> { let priv_key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + let key_bytes = std::fs::read(key_path) .context(format!("Failed to read TLS keys at '{key_path}'"))?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + .context(format!("Failed to parse TLS keys at '{key_path}'"))?; ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); keys.pop().map(rustls::PrivateKey).unwrap() diff --git a/proxy/src/http.rs b/proxy/src/http.rs index a544157800..5cf49b669c 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -3,6 +3,7 @@ //! directly relying on deps like `reqwest` (think loose coupling). pub mod server; +pub mod sql_over_http; pub mod websocket; pub use reqwest::{Request, Response, StatusCode}; diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs new file mode 100644 index 0000000000..0438a82c12 --- /dev/null +++ b/proxy/src/http/sql_over_http.rs @@ -0,0 +1,603 @@ +use futures::pin_mut; +use futures::StreamExt; +use hyper::body::HttpBody; +use hyper::{Body, HeaderMap, Request}; +use pq_proto::StartupMessageParams; +use serde_json::json; +use serde_json::Map; +use serde_json::Value; +use tokio_postgres::types::Kind; +use tokio_postgres::types::Type; +use tokio_postgres::Row; +use url::Url; + +use crate::{auth, config::ProxyConfig, console}; + +#[derive(serde::Deserialize)] +struct QueryData { + query: String, + params: Vec, +} + +const APP_NAME: &str = "sql_over_http"; +const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB +const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB + +// +// Convert json non-string types to strings, so that they can be passed to Postgres +// as parameters. +// +fn json_to_pg_text(json: Vec) -> Result, serde_json::Error> { + json.iter() + .map(|value| { + match value { + Value::Null => serde_json::to_string(value), + Value::Bool(_) => serde_json::to_string(value), + Value::Number(_) => serde_json::to_string(value), + Value::Object(_) => serde_json::to_string(value), + + // no need to escape + Value::String(s) => Ok(s.to_string()), + + // special care for arrays + Value::Array(_) => json_array_to_pg_array(value), + } + }) + .collect() +} + +// +// Serialize a JSON array to a Postgres array. Contrary to the strings in the params +// in the array we need to escape the strings. Postgres is okay with arrays of form +// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving +// it for Postgres to check. 
+// +// Example of the same escaping in node-postgres: packages/pg/lib/utils.js +// +fn json_array_to_pg_array(value: &Value) -> Result { + match value { + // same + Value::Null => serde_json::to_string(value), + Value::Bool(_) => serde_json::to_string(value), + Value::Number(_) => serde_json::to_string(value), + Value::Object(_) => serde_json::to_string(value), + + // now needs to be escaped, as it is part of the array + Value::String(_) => serde_json::to_string(value), + + // recurse into array + Value::Array(arr) => { + let vals = arr + .iter() + .map(json_array_to_pg_array) + .collect::, _>>()? + .join(","); + Ok(format!("{{{}}}", vals)) + } + } +} + +fn get_conn_info( + headers: &HeaderMap, + sni_hostname: Option, +) -> Result<(String, String, String, String), anyhow::Error> { + let connection_string = headers + .get("Neon-Connection-String") + .ok_or(anyhow::anyhow!("missing connection string"))? + .to_str()?; + + let connection_url = Url::parse(connection_string)?; + + let protocol = connection_url.scheme(); + if protocol != "postgres" && protocol != "postgresql" { + return Err(anyhow::anyhow!( + "connection string must start with postgres: or postgresql:" + )); + } + + let mut url_path = connection_url + .path_segments() + .ok_or(anyhow::anyhow!("missing database name"))?; + + let dbname = url_path + .next() + .ok_or(anyhow::anyhow!("invalid database name"))?; + + let username = connection_url.username(); + if username.is_empty() { + return Err(anyhow::anyhow!("missing username")); + } + + let password = connection_url + .password() + .ok_or(anyhow::anyhow!("no password"))?; + + // TLS certificate selector now based on SNI hostname, so if we are running here + // we are sure that SNI hostname is set to one of the configured domain names. + let sni_hostname = sni_hostname.ok_or(anyhow::anyhow!("no SNI hostname set"))?; + + let hostname = connection_url + .host_str() + .ok_or(anyhow::anyhow!("no host"))?; + + let host_header = headers + .get("host") + .and_then(|h| h.to_str().ok()) + .and_then(|h| h.split(':').next()); + + if hostname != sni_hostname { + return Err(anyhow::anyhow!("mismatched SNI hostname and hostname")); + } else if let Some(h) = host_header { + if h != hostname { + return Err(anyhow::anyhow!("mismatched host header and hostname")); + } + } + + Ok(( + username.to_owned(), + dbname.to_owned(), + hostname.to_owned(), + password.to_owned(), + )) +} + +// TODO: return different http error codes +pub async fn handle( + config: &'static ProxyConfig, + request: Request, + sni_hostname: Option, +) -> anyhow::Result { + // + // Determine the destination and connection params + // + let headers = request.headers(); + let (username, dbname, hostname, password) = get_conn_info(headers, sni_hostname)?; + let credential_params = StartupMessageParams::new([ + ("user", &username), + ("database", &dbname), + ("application_name", APP_NAME), + ]); + + // + // Wake up the destination if needed. Code here is a bit involved because + // we reuse the code from the usual proxy and we need to prepare few structures + // that this code expects. 
+ // + let tls = config.tls_config.as_ref(); + let common_names = tls.and_then(|tls| tls.common_names.clone()); + let creds = config + .auth_backend + .as_ref() + .map(|_| auth::ClientCredentials::parse(&credential_params, Some(&hostname), common_names)) + .transpose()?; + let extra = console::ConsoleReqExtra { + session_id: uuid::Uuid::new_v4(), + application_name: Some(APP_NAME), + }; + let node = creds.wake_compute(&extra).await?.expect("msg"); + let conf = node.value.config; + let port = *conf.get_ports().first().expect("no port"); + let host = match conf.get_hosts().first().expect("no host") { + tokio_postgres::config::Host::Tcp(host) => host, + tokio_postgres::config::Host::Unix(_) => { + return Err(anyhow::anyhow!("unix socket is not supported")); + } + }; + + let request_content_length = match request.body().size_hint().upper() { + Some(v) => v, + None => MAX_REQUEST_SIZE + 1, + }; + + if request_content_length > MAX_REQUEST_SIZE { + return Err(anyhow::anyhow!( + "request is too large (max {MAX_REQUEST_SIZE} bytes)" + )); + } + + // + // Read the query and query params from the request body + // + let body = hyper::body::to_bytes(request.into_body()).await?; + let QueryData { query, params } = serde_json::from_slice(&body)?; + let query_params = json_to_pg_text(params)?; + + // + // Connenct to the destination + // + let (client, connection) = tokio_postgres::Config::new() + .host(host) + .port(port) + .user(&username) + .password(&password) + .dbname(&dbname) + .max_backend_message_size(MAX_RESPONSE_SIZE) + .connect(tokio_postgres::NoTls) + .await?; + + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + // + // Now execute the query and return the result + // + let row_stream = client.query_raw_txt(query, query_params).await?; + + // Manually drain the stream into a vector to leave row_stream hanging + // around to get a command tag. Also check that the response is not too + // big. 
+    pin_mut!(row_stream);
+    let mut rows: Vec<Row> = Vec::new();
+    let mut current_size = 0;
+    while let Some(row) = row_stream.next().await {
+        let row = row?;
+        current_size += row.body_len();
+        rows.push(row);
+        if current_size > MAX_RESPONSE_SIZE {
+            return Err(anyhow::anyhow!("response too large"));
+        }
+    }
+
+    // grab the command tag and number of rows affected
+    let command_tag = row_stream.command_tag().unwrap_or_default();
+    let mut command_tag_split = command_tag.split(' ');
+    let command_tag_name = command_tag_split.next().unwrap_or_default();
+    let command_tag_count = if command_tag_name == "INSERT" {
+        // INSERT returns OID first and then number of rows
+        command_tag_split.nth(1)
+    } else {
+        // other commands return number of rows (if any)
+        command_tag_split.next()
+    }
+    .and_then(|s| s.parse::<i64>().ok());
+
+    let fields = if !rows.is_empty() {
+        rows[0]
+            .columns()
+            .iter()
+            .map(|c| {
+                json!({
+                    "name": Value::String(c.name().to_owned()),
+                    "dataTypeID": Value::Number(c.type_().oid().into()),
+                })
+            })
+            .collect::<Vec<Value>>()
+    } else {
+        Vec::new()
+    };
+
+    // convert rows to JSON
+    let rows = rows
+        .iter()
+        .map(pg_text_row_to_json)
+        .collect::<Result<Vec<Value>, _>>()?;
+
+    // resulting JSON format is based on the format of node-postgres result
+    Ok(json!({
+        "command": command_tag_name,
+        "rowCount": command_tag_count,
+        "rows": rows,
+        "fields": fields,
+    }))
+}
+
+//
+// Convert postgres row with text-encoded values to JSON object
+//
+pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
+    let res = row
+        .columns()
+        .iter()
+        .enumerate()
+        .map(|(i, column)| {
+            let name = column.name();
+            let pg_value = row.as_text(i)?;
+            let json_value = pg_text_to_json(pg_value, column.type_())?;
+            Ok((name.to_string(), json_value))
+        })
+        .collect::<Result<serde_json::Map<String, Value>, anyhow::Error>>()?;
+
+    Ok(Value::Object(res))
+}
+
+//
+// Convert postgres text-encoded value to JSON value
+//
+pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
+    if let Some(val) = pg_value {
+        if val == "NULL" {
+            return Ok(Value::Null);
+        }
+
+        if let Kind::Array(elem_type) = pg_type.kind() {
+            return pg_array_parse(val, elem_type);
+        }
+
+        match *pg_type {
+            Type::BOOL => Ok(Value::Bool(val == "t")),
+            Type::INT2 | Type::INT4 => {
+                let val = val.parse::<i32>()?;
+                Ok(Value::Number(serde_json::Number::from(val)))
+            }
+            Type::FLOAT4 | Type::FLOAT8 => {
+                let fval = val.parse::<f64>()?;
+                let num = serde_json::Number::from_f64(fval);
+                if let Some(num) = num {
+                    Ok(Value::Number(num))
+                } else {
+                    // Pass NaN, Inf, -Inf as strings
+                    // JS JSON.stringify() converts them to null, but we
+                    // want to preserve them, so we pass them as strings
+                    Ok(Value::String(val.to_string()))
+                }
+            }
+            Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?),
+            _ => Ok(Value::String(val.to_string())),
+        }
+    } else {
+        Ok(Value::Null)
+    }
+}
+
+//
+// Parse postgres array into JSON array.
+//
+// This is a bit involved because we need to handle nested arrays and quoted
+// values. Unlike postgres we don't check that all nested arrays have the same
+// dimensions, we just return them as is.
+// +fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { + _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) +} + +fn _pg_array_parse( + pg_array: &str, + elem_type: &Type, + nested: bool, +) -> Result<(Value, usize), anyhow::Error> { + let mut pg_array_chr = pg_array.char_indices(); + let mut level = 0; + let mut quote = false; + let mut entries: Vec = Vec::new(); + let mut entry = String::new(); + + // skip bounds decoration + if let Some('[') = pg_array.chars().next() { + for (_, c) in pg_array_chr.by_ref() { + if c == '=' { + break; + } + } + } + + while let Some((mut i, mut c)) = pg_array_chr.next() { + let mut escaped = false; + + if c == '\\' { + escaped = true; + (i, c) = pg_array_chr.next().unwrap(); + } + + match c { + '{' if !quote => { + level += 1; + if level > 1 { + let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; + entries.push(res); + for _ in 0..off - 1 { + pg_array_chr.next(); + } + } + } + '}' => { + level -= 1; + if level == 0 { + if !entry.is_empty() { + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + } + if nested { + return Ok((Value::Array(entries), i)); + } + } + } + '"' if !escaped => { + if quote { + // push even if empty + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + entry = String::new(); + } + quote = !quote; + } + ',' if !quote => { + if !entry.is_empty() { + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + entry = String::new(); + } + } + _ => { + entry.push(c); + } + } + } + + if level != 0 { + return Err(anyhow::anyhow!("unbalanced array")); + } + + Ok((Value::Array(entries), 0)) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_atomic_types_to_pg_params() { + let json = vec![Value::Bool(true), Value::Bool(false)]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["true", "false"]); + + let json = vec![Value::Number(serde_json::Number::from(42))]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["42"]); + + let json = vec![Value::String("foo\"".to_string())]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["foo\""]); + + let json = vec![Value::Null]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["null"]); + } + + #[test] + fn test_json_array_to_pg_array() { + // atoms and escaping + let json = "[true, false, null, 42, \"foo\", \"bar\\\"-\\\\\"]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]).unwrap(); + assert_eq!( + pg_params, + vec!["{true,false,null,42,\"foo\",\"bar\\\"-\\\\\"}"] + ); + + // nested arrays + let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]).unwrap(); + assert_eq!( + pg_params, + vec!["{{true,false},{null,42},{\"foo\",\"bar\\\"-\\\\\"}}"] + ); + } + + #[test] + fn test_atomic_types_parse() { + assert_eq!( + pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), + json!("foo") + ); + assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); + assert_eq!( + pg_text_to_json(Some("42"), &Type::INT8).unwrap(), + json!("42") + ); + assert_eq!( + pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("42.42"), 
&Type::FLOAT4).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), + json!("NaN") + ); + assert_eq!( + pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), + json!("Infinity") + ); + assert_eq!( + pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), + json!("-Infinity") + ); + + let json: Value = + serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}") + .unwrap(); + assert_eq!( + pg_text_to_json( + Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), + &Type::JSONB + ) + .unwrap(), + json + ); + } + + #[test] + fn test_pg_array_parse_text() { + fn pt(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::TEXT).unwrap() + } + assert_eq!( + pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), + json!(["aa\"\\,a", "cha", "bbbb"]) + ); + assert_eq!( + pt(r#"{{"foo","bar"},{"bee","bop"}}"#), + json!([["foo", "bar"], ["bee", "bop"]]) + ); + assert_eq!( + pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#), + json!([[[["foo", null, "bop", "bup"]]]]) + ); + assert_eq!( + pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#), + json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]]) + ); + } + + #[test] + fn test_pg_array_parse_bool() { + fn pb(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::BOOL).unwrap() + } + assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); + assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); + assert_eq!( + pb(r#"{{t,f},{f,t}}"#), + json!([[true, false], [false, true]]) + ); + assert_eq!( + pb(r#"{{t,NULL},{NULL,f}}"#), + json!([[true, null], [null, false]]) + ); + } + + #[test] + fn test_pg_array_parse_numbers() { + fn pn(pg_arr: &str, ty: &Type) -> Value { + pg_array_parse(pg_arr, ty).unwrap() + } + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0])); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4), + json!(["NaN", "Infinity", "-Infinity"]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8), + json!(["NaN", "Infinity", "-Infinity"]) + ); + } + + #[test] + fn test_pg_array_with_decoration() { + fn p(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::INT2).unwrap() + } + assert_eq!( + p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), + json!([[[1, 2, 3], [4, 5, 6]]]) + ); + } +} diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index c7676e8e14..fbb602e3d2 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -4,12 +4,17 @@ use crate::{ use bytes::{Buf, Bytes}; use futures::{Sink, Stream, StreamExt}; use hyper::{ - server::{accept, conn::AddrIncoming}, + server::{ + accept, + conn::{AddrIncoming, AddrStream}, + }, upgrade::Upgraded, - Body, Request, Response, StatusCode, + Body, Method, Request, Response, StatusCode, }; use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; use pin_project_lite::pin_project; +use serde_json::{json, Value}; + use std::{ convert::Infallible, future::ready, @@ -21,6 +26,7 @@ use tls_listener::TlsListener; use tokio::{ io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}, net::TcpListener, + select, }; 
use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; @@ -30,6 +36,8 @@ use utils::http::{error::ApiError, json::json_response}; // Tracking issue: https://github.com/rust-lang/rust/issues/98407. use sync_wrapper::SyncWrapper; +use super::sql_over_http; + pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. @@ -159,6 +167,7 @@ async fn ws_handler( config: &'static ProxyConfig, cancel_map: Arc, session_id: uuid::Uuid, + sni_hostname: Option, ) -> Result, ApiError> { let host = request .headers() @@ -181,8 +190,44 @@ async fn ws_handler( // Return the response so the spawned future can continue. Ok(response) + // TODO: that deserves a refactor as now this function also handles http json client besides websockets. + // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead. + } else if request.uri().path() == "/sql" && request.method() == Method::POST { + let result = select! { + _ = tokio::time::sleep(std::time::Duration::from_secs(10)) => { + Err(anyhow::anyhow!("Query timed out")) + } + response = sql_over_http::handle(config, request, sni_hostname) => { + response + } + }; + let status_code = match result { + Ok(_) => StatusCode::OK, + Err(_) => StatusCode::BAD_REQUEST, + }; + let json = match result { + Ok(r) => r, + Err(e) => { + let message = format!("{:?}", e); + let code = match e.downcast_ref::() { + Some(e) => match e.code() { + Some(e) => serde_json::to_value(e.code()).unwrap(), + None => Value::Null, + }, + None => Value::Null, + }; + json!({ "message": message, "code": code }) + } + }; + json_response(status_code, json).map(|mut r| { + r.headers_mut().insert( + "Access-Control-Allow-Origin", + hyper::http::HeaderValue::from_static("*"), + ); + r + }) } else { - json_response(StatusCode::OK, "Connect with a websocket client") + json_response(StatusCode::BAD_REQUEST, "query is not supported") } } @@ -216,20 +261,27 @@ pub async fn task_main( } }); - let make_svc = hyper::service::make_service_fn(|_stream| async move { - Ok::<_, Infallible>(hyper::service::service_fn( - move |req: Request| async move { - let cancel_map = Arc::new(CancelMap::default()); - let session_id = uuid::Uuid::new_v4(); - ws_handler(req, config, cancel_map, session_id) - .instrument(info_span!( - "ws-client", - session = format_args!("{session_id}") - )) - .await - }, - )) - }); + let make_svc = + hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream| { + let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string()); + + async move { + Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request| { + let sni_name = sni_name.clone(); + async move { + let cancel_map = Arc::new(CancelMap::default()); + let session_id = uuid::Uuid::new_v4(); + + ws_handler(req, config, cancel_map, session_id, sni_name) + .instrument(info_span!( + "ws-client", + session = format_args!("{session_id}") + )) + .await + } + })) + } + }); hyper::Server::builder(accept::from_stream(tls_listener)) .serve(make_svc) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8ec17834ac..bde91e6783 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2042,15 +2042,19 @@ class NeonProxy(PgProtocol): proxy_port: int, http_port: int, mgmt_port: int, + external_http_port: int, auth_backend: NeonProxy.AuthBackend, metric_collection_endpoint: Optional[str] = None, 
metric_collection_interval: Optional[str] = None, ): host = "127.0.0.1" - super().__init__(dsn=auth_backend.default_conn_url, host=host, port=proxy_port) + domain = "proxy.localtest.me" # resolves to 127.0.0.1 + super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port) + self.domain = domain self.host = host self.http_port = http_port + self.external_http_port = external_http_port self.neon_binpath = neon_binpath self.test_output_dir = test_output_dir self.proxy_port = proxy_port @@ -2062,11 +2066,42 @@ class NeonProxy(PgProtocol): def start(self) -> NeonProxy: assert self._popen is None + + # generate key of it doesn't exist + crt_path = self.test_output_dir / "proxy.crt" + key_path = self.test_output_dir / "proxy.key" + + if not key_path.exists(): + r = subprocess.run( + [ + "openssl", + "req", + "-new", + "-x509", + "-days", + "365", + "-nodes", + "-text", + "-out", + str(crt_path), + "-keyout", + str(key_path), + "-subj", + "/CN=*.localtest.me", + "-addext", + "subjectAltName = DNS:*.localtest.me", + ] + ) + assert r.returncode == 0 + args = [ str(self.neon_binpath / "proxy"), *["--http", f"{self.host}:{self.http_port}"], *["--proxy", f"{self.host}:{self.proxy_port}"], *["--mgmt", f"{self.host}:{self.mgmt_port}"], + *["--wss", f"{self.host}:{self.external_http_port}"], + *["-c", str(crt_path)], + *["-k", str(key_path)], *self.auth_backend.extra_args(), ] @@ -2190,6 +2225,7 @@ def link_proxy( http_port = port_distributor.get_port() proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -2197,6 +2233,7 @@ def link_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + external_http_port=external_http_port, auth_backend=NeonProxy.Link(), ) as proxy: proxy.start() @@ -2224,6 +2261,7 @@ def static_proxy( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -2231,6 +2269,7 @@ def static_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + external_http_port=external_http_port, auth_backend=NeonProxy.Postgres(auth_endpoint), ) as proxy: proxy.start() diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 1231188896..00ea77f2e7 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -204,6 +204,7 @@ def proxy_with_metric_collector( http_port = port_distributor.get_port() proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" @@ -215,6 +216,7 @@ def proxy_with_metric_collector( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + external_http_port=external_http_port, metric_collection_endpoint=metric_collection_endpoint, metric_collection_interval=metric_collection_interval, auth_backend=NeonProxy.Link(), diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index ae914e384e..6be3995714 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -1,22 +1,32 @@ +import json import subprocess +from typing import Any, List import psycopg2 import pytest +import requests 
from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_proxy_select_1(static_proxy: NeonProxy, option_name: str): +def test_proxy_select_1(static_proxy: NeonProxy): """ A simplest smoke test: check proxy against a local postgres instance. """ - out = static_proxy.safe_psql("select 1", options=f"{option_name}=generic-project-name") + # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) + out = static_proxy.safe_psql("select 1", sslsni=0, options="project=generic-project-name") assert out[0][0] == 1 + # no SNI, new `options=endpoint` syntax + out = static_proxy.safe_psql("select 1", sslsni=0, options="endpoint=generic-project-name") + assert out[0][0] == 1 -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_password_hack(static_proxy: NeonProxy, option_name: str): + # with SNI + out = static_proxy.safe_psql("select 42", host="generic-project-name.localtest.me") + assert out[0][0] == 42 + + +def test_password_hack(static_proxy: NeonProxy): """ Check the PasswordHack auth flow: an alternative to SCRAM auth for clients which can't provide the project/endpoint name via SNI or `options`. @@ -24,14 +34,16 @@ def test_password_hack(static_proxy: NeonProxy, option_name: str): user = "borat" password = "password" - static_proxy.safe_psql( - f"create role {user} with login password '{password}'", - options=f"{option_name}=irrelevant", - ) + static_proxy.safe_psql(f"create role {user} with login password '{password}'") # Note the format of `magic`! - magic = f"{option_name}=irrelevant;{password}" - static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + magic = f"project=irrelevant;{password}" + out = static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + assert out[0][0] == 1 + + magic = f"endpoint=irrelevant;{password}" + out = static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + assert out[0][0] == 1 # Must also check that invalid magic won't be accepted. with pytest.raises(psycopg2.OperationalError): @@ -69,52 +81,55 @@ def test_proxy_options(static_proxy: NeonProxy, option_name: str): """ options = f"{option_name}=irrelevant -cproxytest.option=value" - out = static_proxy.safe_psql("show proxytest.option", options=options) + out = static_proxy.safe_psql("show proxytest.option", options=options, sslsni=0) assert out[0][0] == "value" options = f"-c proxytest.foo=\\ str {option_name}=irrelevant" + out = static_proxy.safe_psql("show proxytest.foo", options=options, sslsni=0) + assert out[0][0] == " str" + + options = "-cproxytest.option=value" + out = static_proxy.safe_psql("show proxytest.option", options=options) + assert out[0][0] == "value" + + options = "-c proxytest.foo=\\ str" out = static_proxy.safe_psql("show proxytest.foo", options=options) assert out[0][0] == " str" -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_auth_errors(static_proxy: NeonProxy, option_name: str): +def test_auth_errors(static_proxy: NeonProxy): """ Check that we throw very specific errors in some unsuccessful auth scenarios. 
""" # User does not exist with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", options=f"{option_name}=irrelevant") + static_proxy.connect(user="pinocchio") text = str(exprinfo.value).strip() - assert text.endswith("password authentication failed for user 'pinocchio'") + assert text.find("password authentication failed for user 'pinocchio'") != -1 static_proxy.safe_psql( "create role pinocchio with login password 'magic'", - options=f"{option_name}=irrelevant", ) # User exists, but password is missing with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", password=None, options=f"{option_name}=irrelevant") + static_proxy.connect(user="pinocchio", password=None) text = str(exprinfo.value).strip() - assert text.endswith("password authentication failed for user 'pinocchio'") + assert text.find("password authentication failed for user 'pinocchio'") != -1 # User exists, but password is wrong with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", password="bad", options=f"{option_name}=irrelevant") + static_proxy.connect(user="pinocchio", password="bad") text = str(exprinfo.value).strip() - assert text.endswith("password authentication failed for user 'pinocchio'") + assert text.find("password authentication failed for user 'pinocchio'") != -1 # Finally, check that the user can connect - with static_proxy.connect( - user="pinocchio", password="magic", options=f"{option_name}=irrelevant" - ): + with static_proxy.connect(user="pinocchio", password="magic"): pass -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str): +def test_forward_params_to_client(static_proxy: NeonProxy): """ Check that we forward all necessary PostgreSQL server params to client. """ @@ -140,7 +155,7 @@ def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str): where name = any(%s) """ - with static_proxy.connect(options=f"{option_name}=irrelevant") as conn: + with static_proxy.connect() as conn: with conn.cursor() as cur: cur.execute(query, (reported_params_subset,)) for name, value in cur.fetchall(): @@ -148,18 +163,65 @@ def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str): assert conn.get_parameter_status(name) == value -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) @pytest.mark.timeout(5) -def test_close_on_connections_exit(static_proxy: NeonProxy, option_name: str): +def test_close_on_connections_exit(static_proxy: NeonProxy): # Open two connections, send SIGTERM, then ensure that proxy doesn't exit # until after connections close. 
- with static_proxy.connect(options=f"{option_name}=irrelevant"), static_proxy.connect( - options=f"{option_name}=irrelevant" - ): + with static_proxy.connect(), static_proxy.connect(): static_proxy.terminate() with pytest.raises(subprocess.TimeoutExpired): static_proxy.wait_for_exit(timeout=2) # Ensure we don't accept any more connections with pytest.raises(psycopg2.OperationalError): - static_proxy.connect(options=f"{option_name}=irrelevant") + static_proxy.connect() static_proxy.wait_for_exit() + + +def test_sql_over_http(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + def q(sql: str, params: List[Any] = []) -> Any: + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps({"query": sql, "params": params}), + headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200 + return response.json() + + rows = q("select 42 as answer")["rows"] + assert rows == [{"answer": 42}] + + rows = q("select $1 as answer", [42])["rows"] + assert rows == [{"answer": "42"}] + + rows = q("select $1 * 1 as answer", [42])["rows"] + assert rows == [{"answer": 42}] + + rows = q("select $1::int[] as answer", [[1, 2, 3]])["rows"] + assert rows == [{"answer": [1, 2, 3]}] + + rows = q("select $1::json->'a' as answer", [{"a": {"b": 42}}])["rows"] + assert rows == [{"answer": {"b": 42}}] + + rows = q("select * from pg_class limit 1")["rows"] + assert len(rows) == 1 + + res = q("create table t(id serial primary key, val int)") + assert res["command"] == "CREATE" + assert res["rowCount"] is None + + res = q("insert into t(val) values (10), (20), (30) returning id") + assert res["command"] == "INSERT" + assert res["rowCount"] == 3 + assert res["rows"] == [{"id": 1}, {"id": 2}, {"id": 3}] + + res = q("select * from t") + assert res["command"] == "SELECT" + assert res["rowCount"] == 3 + + res = q("drop table t") + assert res["command"] == "DROP" + assert res["rowCount"] is None From a475cdf642df08501c3c2035bcee454da68a3dfa Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 13 Jun 2023 13:34:56 +0200 Subject: [PATCH 55/63] [compute_ctl] Fix logging if catalog updates are skipped (#4480) Otherwise, it wasn't clear from the log when Postgres started up completely if catalog updates were skipped. 
Follow-up for 4936ab6 --- compute_tools/src/compute.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 977708a18f..94cebf93de 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -370,11 +370,6 @@ impl ComputeNode { // 'Close' connection drop(client); - info!( - "finished configuration of compute for project {}", - spec.cluster.cluster_id.as_deref().unwrap_or("None") - ); - Ok(()) } @@ -427,22 +422,22 @@ impl ComputeNode { #[instrument(skip(self))] pub fn start_compute(&self) -> Result { let compute_state = self.state.lock().unwrap().clone(); - let spec = compute_state.pspec.as_ref().expect("spec must be set"); + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), - spec.spec.operation_uuid.as_deref().unwrap_or("None"), - spec.tenant_id, - spec.timeline_id, + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), + pspec.spec.operation_uuid.as_deref().unwrap_or("None"), + pspec.tenant_id, + pspec.timeline_id, ); self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); - let pg = self.start_postgres(spec.storage_auth_token.clone())?; + let pg = self.start_postgres(pspec.storage_auth_token.clone())?; - if spec.spec.mode == ComputeMode::Primary && !spec.spec.skip_pg_catalog_updates { + if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { self.apply_config(&compute_state)?; } @@ -462,6 +457,11 @@ impl ComputeNode { } self.set_status(ComputeStatus::Running); + info!( + "finished configuration of compute for project {}", + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None") + ); + Ok(pg) } From e437787c8fdefc9859a37c69c31d0bf336f84aa9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 21 Jun 2023 15:50:52 +0300 Subject: [PATCH 56/63] cargo update -p openssl (#4542) To unblock release https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054 Context: https://rustsec.org/advisories/RUSTSEC-2023-0044 --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 71a6699c50..4be74614c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2349,9 +2349,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "openssl" -version = "0.10.52" +version = "0.10.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56" +checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" dependencies = [ "bitflags", "cfg-if", @@ -2381,9 +2381,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.87" +version = "0.9.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e" +checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" dependencies = [ "cc", "libc", From 6bc756129065cb3d2e1be92289fce749e2920891 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 23 Jun 2023 20:43:20 +0200 Subject: [PATCH 57/63] don't use MGMT_REQUEST_RUNTIME for consumption metrics synthetic size worker The consumption metrics synthetic size worker does logical size calculation. 
Logical size calculation currently does synchronous disk IO. This blocks the MGMT_REQUEST_RUNTIME's executor threads, starving other futures. While there's work on the way to move the synchronous disk IO into spawn_blocking, the quickfix here is to use the BACKGROUND_RUNTIME instead of MGMT_REQUEST_RUNTIME. Actually it's not just a quickfix. We simply shouldn't be blocking MGMT_REQUEST_RUNTIME executor threads on CPU or sync disk IO. That work isn't done yet, as many of the mgmt tasks still _do_ disk IO. But it's not as intensive as the logical size calculations that we're fixing here. While we're at it, fix disk-usage-based eviction in a similar way. It wasn't the culprit here, according to prod logs, but it can theoretically be a little CPU-intensive. More context, including graphs from Prod: https://neondb.slack.com/archives/C03F5SM1N02/p1687541681336949 (cherry picked from commit d6e35222ea592428b78401ff0053b51424674e03) --- pageserver/src/bin/pageserver.rs | 82 ++++++++++++++++---------------- pageserver/src/http/routes.rs | 4 +- 2 files changed, 42 insertions(+), 44 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1fa5e4ab3b..b01ace63e4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -495,50 +495,50 @@ fn start_pageserver( Ok(()) }, ); + } - if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { - let background_jobs_barrier = background_jobs_barrier; - let metrics_ctx = RequestContext::todo_child( - TaskKind::MetricsCollection, - // This task itself shouldn't download anything. - // The actual size calculation does need downloads, and - // creates a child context with the right DownloadBehavior. - DownloadBehavior::Error, - ); - task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), - TaskKind::MetricsCollection, - None, - None, - "consumption metrics collection", - true, - async move { - // first wait until background jobs are cleared to launch. - // - // this is because we only process active tenants and timelines, and the - // Timeline::get_current_logical_size will spawn the logical size calculation, - // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); + if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let background_jobs_barrier = background_jobs_barrier; + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); + task_mgr::spawn( + crate::BACKGROUND_RUNTIME.handle(), + TaskKind::MetricsCollection, + None, + None, + "consumption metrics collection", + true, + async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + let cancel = task_mgr::shutdown_token(); - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => {} - }; + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => {} + }; - pageserver::consumption_metrics::collect_metrics( - metric_collection_endpoint, - conf.metric_collection_interval, - conf.cached_metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) - }, - ); - } + pageserver::consumption_metrics::collect_metrics( + metric_collection_endpoint, + conf.metric_collection_interval, + conf.cached_metric_collection_interval, + conf.synthetic_size_calculation_interval, + conf.id, + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")) + .await?; + Ok(()) + }, + ); } // Spawn a task to listen for libpq connections. It will spawn further tasks diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index fc8da70cc0..0a55741f84 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1128,8 +1128,6 @@ async fn disk_usage_eviction_run( freed_bytes: 0, }; - use crate::task_mgr::MGMT_REQUEST_RUNTIME; - let (tx, rx) = tokio::sync::oneshot::channel(); let state = get_state(&r); @@ -1147,7 +1145,7 @@ async fn disk_usage_eviction_run( let _g = cancel.drop_guard(); crate::task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), + crate::task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::DiskUsageEviction, None, None, From feff887c6f5418316870533435564adbaacf8195 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 5 Jul 2023 18:40:25 +0100 Subject: [PATCH 58/63] Compile `pg_embedding` extension (#4634) ``` CREATE EXTENSION embedding; CREATE TABLE t (val real[]); INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); INSERT INTO t (val) VALUES (array[1,2,4]); SELECT * FROM t ORDER BY val <-> array[3,3,3]; val --------- {1,2,3} {1,2,4} {1,1,1} {0,0,0} (5 rows) ``` --- Dockerfile.compute-node | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 310e4c32a3..7208024d63 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -515,6 +515,25 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control +######################################################################################### +# +# Layer "pg-embedding-pg-build" +# compile pg_embedding extension +# +######################################################################################### +FROM build-deps AS pg-embedding-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023 +# There is no release tag yet +RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \ + echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \ + mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control + ######################################################################################### # # Layer "pg-anon-pg-build" @@ -671,6 +690,7 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From efa6aa134f4190d4ad9c10eaa55b17abd79b9457 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 7 Jul 2023 17:50:50 +0100 Subject: [PATCH 59/63] allow repeated IO errors from compute node (#4624) ## Problem #4598 compute nodes are not accessible some time after wake up due to kubernetes DNS not being fully propagated. ## Summary of changes Update connect retry mechanism to support handling IO errors and sleeping for 100ms ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- proxy/src/http/conn_pool.rs | 69 +++++++++++++++++----------- proxy/src/proxy.rs | 92 ++++++++++++++++++++++++++++++------- proxy/src/proxy/tests.rs | 17 +++++++ 3 files changed, 136 insertions(+), 42 deletions(-) diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index 52c1e2f2ce..27950d3a20 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -2,16 +2,15 @@ use parking_lot::Mutex; use pq_proto::StartupMessageParams; use std::fmt; use std::{collections::HashMap, sync::Arc}; - -use futures::TryFutureExt; +use tokio::time; use crate::config; use crate::{auth, console}; use super::sql_over_http::MAX_RESPONSE_SIZE; -use crate::proxy::invalidate_cache; -use crate::proxy::NUM_RETRIES_WAKE_COMPUTE; +use crate::proxy::try_wake; +use crate::proxy::{BASE_RETRY_WAIT_DURATION, NUM_RETRIES_WAKE_COMPUTE}; use tracing::error; use tracing::info; @@ -223,32 +222,59 @@ async fn connect_to_compute( // This code is a copy of `connect_to_compute` from `src/proxy.rs` with // the difference that it uses `tokio_postgres` for the connection. - let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE; + let mut num_retries = 0; + let mut should_wake = true; loop { match connect_to_compute_once(node_info, conn_info).await { - Err(e) if num_retries > 0 => { - info!("compute node's state has changed; requesting a wake-up"); - match creds.wake_compute(&extra).await? { - // Update `node_info` and try one more time. - Some(new) => { - *node_info = new; + Err(e) if num_retries == NUM_RETRIES_WAKE_COMPUTE => { + if let Some(wait_duration) = retry_connect_in(&e, num_retries) { + error!(error = ?e, "could not connect to compute node"); + if should_wake { + match try_wake(node_info, &extra, &creds).await { + Ok(Some(x)) => should_wake = x, + Ok(None) => return Err(e.into()), + Err(e) => return Err(e.into()), + } } - // Link auth doesn't work that way, so we just exit. 
- None => return Err(e), + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + } else { + return Err(e.into()); } } - other => return other, + other => return Ok(other?), } - num_retries -= 1; - info!("retrying after wake-up ({num_retries} attempts left)"); + num_retries += 1; + info!(retries_left = num_retries, "retrying connect"); + } +} + +fn retry_connect_in(err: &tokio_postgres::Error, num_retries: u32) -> Option { + use tokio_postgres::error::SqlState; + match err.code() { + // retry all errors at least once immediately + _ if num_retries == 0 => Some(time::Duration::ZERO), + // keep retrying connection errors every 100ms + Some( + &SqlState::CONNECTION_FAILURE + | &SqlState::CONNECTION_EXCEPTION + | &SqlState::CONNECTION_DOES_NOT_EXIST + | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, + ) => { + // 3/2 = 1.5 which seems to be an ok growth factor heuristic + Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) + } + // otherwise, don't retry + _ => None, } } async fn connect_to_compute_once( node_info: &console::CachedNodeInfo, conn_info: &ConnInfo, -) -> anyhow::Result { +) -> Result { let mut config = (*node_info.config).clone(); let (client, connection) = config @@ -257,15 +283,6 @@ async fn connect_to_compute_once( .dbname(&conn_info.dbname) .max_backend_message_size(MAX_RESPONSE_SIZE) .connect(tokio_postgres::NoTls) - .inspect_err(|e: &tokio_postgres::Error| { - error!( - "failed to connect to compute node hosts={:?} ports={:?}: {}", - node_info.config.get_hosts(), - node_info.config.get_ports(), - e - ); - invalidate_cache(node_info) - }) .await?; tokio::spawn(async move { diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 2433412c4c..5c5353a63e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -6,12 +6,17 @@ use crate::{ cancellation::{self, CancelMap}, compute::{self, PostgresConnection}, config::{ProxyConfig, TlsConfig}, - console::{self, messages::MetricsAuxInfo}, + console::{ + self, + errors::{ApiError, WakeComputeError}, + messages::MetricsAuxInfo, + }, error::io_error, stream::{PqStream, Stream}, }; use anyhow::{bail, Context}; use futures::TryFutureExt; +use hyper::StatusCode; use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; @@ -25,7 +30,9 @@ use tracing::{error, info, warn}; use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. -pub const NUM_RETRIES_WAKE_COMPUTE: usize = 1; +/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n +pub const NUM_RETRIES_WAKE_COMPUTE: u32 = 10; +pub const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; @@ -315,7 +322,6 @@ async fn connect_to_compute_once( node_info .config .connect(allow_self_signed_compute, timeout) - .inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info)) .await } @@ -328,7 +334,8 @@ async fn connect_to_compute( extra: &console::ConsoleReqExtra<'_>, creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, ) -> Result { - let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE; + let mut num_retries = 0; + let mut should_wake = true; loop { // Apply startup params to the (possibly, cached) compute node info. 
node_info.config.set_startup_params(params); @@ -346,30 +353,83 @@ async fn connect_to_compute( // We only use caching in case of scram proxy backed by the console, so reduce // the timeout only in that case. let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _)); - let timeout = if is_scram_proxy && num_retries == NUM_RETRIES_WAKE_COMPUTE { + let timeout = if is_scram_proxy && num_retries == 0 { time::Duration::from_secs(2) } else { time::Duration::from_secs(10) }; match connect_to_compute_once(node_info, timeout).await { - Err(e) if num_retries > 0 => { - info!("compute node's state has changed; requesting a wake-up"); - match creds.wake_compute(extra).map_err(io_error).await? { - // Update `node_info` and try one more time. - Some(mut new) => { - new.config.reuse_password(&node_info.config); - *node_info = new; + Err(e) if num_retries < NUM_RETRIES_WAKE_COMPUTE => { + if let Some(wait_duration) = retry_connect_in(&e, num_retries) { + error!(error = ?e, "could not connect to compute node"); + if should_wake { + match try_wake(node_info, extra, creds).await { + Ok(Some(x)) => { + should_wake = x; + } + Ok(None) => return Err(e), + Err(e) => return Err(io_error(e).into()), + } } - // Link auth doesn't work that way, so we just exit. - None => return Err(e), + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + } else { + return Err(e); } } other => return other, } - num_retries -= 1; - info!("retrying after wake-up ({num_retries} attempts left)"); + num_retries += 1; + info!(retries_left = num_retries, "retrying connect"); + } +} + +/// Attempts to wake up the compute node. +/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable +/// * Returns Ok(Some(false)) if the wakeup succeeded +/// * Returns Ok(None) or Err(e) if there was an error +pub async fn try_wake( + node_info: &mut console::CachedNodeInfo, + extra: &console::ConsoleReqExtra<'_>, + creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, +) -> Result, WakeComputeError> { + info!("compute node's state has likely changed; requesting a wake-up"); + invalidate_cache(node_info); + match creds.wake_compute(extra).await { + // retry wake if the compute was in an invalid state + Err(WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::BAD_REQUEST, + .. + })) => Ok(Some(true)), + // Update `node_info` and try again. + Ok(Some(mut new)) => { + new.config.reuse_password(&node_info.config); + *node_info = new; + Ok(Some(false)) + } + Err(e) => Err(e), + Ok(None) => Ok(None), + } +} + +fn retry_connect_in(err: &compute::ConnectionError, num_retries: u32) -> Option { + use std::io::ErrorKind; + match err { + // retry all errors at least once immediately + _ if num_retries == 0 => Some(time::Duration::ZERO), + // keep retrying connection errors every 100ms + compute::ConnectionError::CouldNotConnect(io_err) => match io_err.kind() { + ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable => { + // 3/2 = 1.5 which seems to be an ok growth factor heuristic + Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) + } + _ => None, + }, + // otherwise, don't retry + _ => None, } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 3373c49676..a1f6cd3ed4 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,4 +1,6 @@ //! A group of high-level tests for connection establishing logic and auth. 
+use std::io; + use super::*; use crate::{auth, sasl, scram}; use async_trait::async_trait; @@ -294,3 +296,18 @@ async fn scram_auth_mock() -> anyhow::Result<()> { Ok(()) } + +#[test] +fn connect_compute_total_wait() { + let err = compute::ConnectionError::CouldNotConnect(io::Error::new( + io::ErrorKind::ConnectionRefused, + "conn refused", + )); + + let mut total_wait = tokio::time::Duration::ZERO; + for num_retries in 0..10 { + total_wait += retry_connect_in(&err, num_retries).unwrap(); + } + assert!(total_wait < tokio::time::Duration::from_secs(12)); + assert!(total_wait > tokio::time::Duration::from_secs(10)); +} From 39a28d11083eefc72c5f302c21e0e74742129e14 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 12 Jul 2023 11:38:36 +0100 Subject: [PATCH 60/63] proxy wake_compute loop (#4675) ## Problem If we fail to wake up the compute node, a subsequent connect attempt will definitely fail. However, kubernetes won't fail the connection immediately, instead it hangs until we timeout (10s). ## Summary of changes Refactor the loop to allow fast retries of compute_wake and to skip a connect attempt. --- proxy/src/http/conn_pool.rs | 81 ++++++++++++++++----------- proxy/src/proxy.rs | 108 ++++++++++++++++++++++++------------ proxy/src/proxy/tests.rs | 9 +-- 3 files changed, 121 insertions(+), 77 deletions(-) diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index 27950d3a20..fb53c663c8 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -1,6 +1,7 @@ use parking_lot::Mutex; use pq_proto::StartupMessageParams; use std::fmt; +use std::ops::ControlFlow; use std::{collections::HashMap, sync::Arc}; use tokio::time; @@ -9,8 +10,7 @@ use crate::{auth, console}; use super::sql_over_http::MAX_RESPONSE_SIZE; -use crate::proxy::try_wake; -use crate::proxy::{BASE_RETRY_WAIT_DURATION, NUM_RETRIES_WAKE_COMPUTE}; +use crate::proxy::{invalidate_cache, retry_after, try_wake, NUM_RETRIES_WAKE_COMPUTE}; use tracing::error; use tracing::info; @@ -184,11 +184,10 @@ impl GlobalConnPool { } } -// // Wake up the destination if needed. Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. -// +#[tracing::instrument(skip_all)] async fn connect_to_compute( config: &config::ProxyConfig, conn_info: &ConnInfo, @@ -220,54 +219,72 @@ async fn connect_to_compute( let node_info = &mut creds.wake_compute(&extra).await?.expect("msg"); - // This code is a copy of `connect_to_compute` from `src/proxy.rs` with - // the difference that it uses `tokio_postgres` for the connection. 
let mut num_retries = 0; - let mut should_wake = true; + let mut wait_duration = time::Duration::ZERO; + let mut should_wake_with_error = None; loop { + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + + // try wake the compute node if we have determined it's sensible to do so + if let Some(err) = should_wake_with_error.take() { + match try_wake(node_info, &extra, &creds).await { + // we can't wake up the compute node + Ok(None) => return Err(err), + // there was an error communicating with the control plane + Err(e) => return Err(e.into()), + // failed to wake up but we can continue to retry + Ok(Some(ControlFlow::Continue(()))) => { + wait_duration = retry_after(num_retries); + should_wake_with_error = Some(err); + + num_retries += 1; + info!(num_retries, "retrying wake compute"); + continue; + } + // successfully woke up a compute node and can break the wakeup loop + Ok(Some(ControlFlow::Break(()))) => {} + } + } + match connect_to_compute_once(node_info, conn_info).await { - Err(e) if num_retries == NUM_RETRIES_WAKE_COMPUTE => { - if let Some(wait_duration) = retry_connect_in(&e, num_retries) { - error!(error = ?e, "could not connect to compute node"); - if should_wake { - match try_wake(node_info, &extra, &creds).await { - Ok(Some(x)) => should_wake = x, - Ok(None) => return Err(e.into()), - Err(e) => return Err(e.into()), - } - } - if !wait_duration.is_zero() { - time::sleep(wait_duration).await; - } - } else { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + if !can_retry_error(&e, num_retries) { return Err(e.into()); } + wait_duration = retry_after(num_retries); + + // after the first connect failure, + // we should invalidate the cache and wake up a new compute node + if num_retries == 0 { + invalidate_cache(node_info); + should_wake_with_error = Some(e.into()); + } } - other => return Ok(other?), } num_retries += 1; - info!(retries_left = num_retries, "retrying connect"); + info!(num_retries, "retrying connect"); } } -fn retry_connect_in(err: &tokio_postgres::Error, num_retries: u32) -> Option { +fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool { use tokio_postgres::error::SqlState; match err.code() { - // retry all errors at least once immediately - _ if num_retries == 0 => Some(time::Duration::ZERO), - // keep retrying connection errors every 100ms + // retry all errors at least once + _ if num_retries == 0 => true, + // keep retrying connection errors Some( &SqlState::CONNECTION_FAILURE | &SqlState::CONNECTION_EXCEPTION | &SqlState::CONNECTION_DOES_NOT_EXIST | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, - ) => { - // 3/2 = 1.5 which seems to be an ok growth factor heuristic - Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) - } + ) if num_retries < NUM_RETRIES_WAKE_COMPUTE => true, // otherwise, don't retry - _ => None, + _ => false, } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5c5353a63e..12ca9c5187 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use hyper::StatusCode; use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; -use std::sync::Arc; +use std::{ops::ControlFlow, sync::Arc}; use tokio::{ io::{AsyncRead, AsyncWrite, AsyncWriteExt}, time, @@ -32,7 +32,7 @@ use utils::measured_stream::MeasuredStream; /// Number of times we should retry the 
`/proxy_wake_compute` http request. /// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n pub const NUM_RETRIES_WAKE_COMPUTE: u32 = 10; -pub const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); +const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; @@ -335,11 +335,37 @@ async fn connect_to_compute( creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, ) -> Result { let mut num_retries = 0; - let mut should_wake = true; + let mut wait_duration = time::Duration::ZERO; + let mut should_wake_with_error = None; loop { // Apply startup params to the (possibly, cached) compute node info. node_info.config.set_startup_params(params); + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + + // try wake the compute node if we have determined it's sensible to do so + if let Some(err) = should_wake_with_error.take() { + match try_wake(node_info, extra, creds).await { + // we can't wake up the compute node + Ok(None) => return Err(err), + // there was an error communicating with the control plane + Err(e) => return Err(io_error(e).into()), + // failed to wake up but we can continue to retry + Ok(Some(ControlFlow::Continue(()))) => { + wait_duration = retry_after(num_retries); + should_wake_with_error = Some(err); + + num_retries += 1; + info!(num_retries, "retrying wake compute"); + continue; + } + // successfully woke up a compute node and can break the wakeup loop + Ok(Some(ControlFlow::Break(()))) => {} + } + } + // Set a shorter timeout for the initial connection attempt. // // In case we try to connect to an outdated address that is no longer valid, the @@ -359,31 +385,29 @@ async fn connect_to_compute( time::Duration::from_secs(10) }; + // do this again to ensure we have username? 
+ node_info.config.set_startup_params(params); + match connect_to_compute_once(node_info, timeout).await { - Err(e) if num_retries < NUM_RETRIES_WAKE_COMPUTE => { - if let Some(wait_duration) = retry_connect_in(&e, num_retries) { - error!(error = ?e, "could not connect to compute node"); - if should_wake { - match try_wake(node_info, extra, creds).await { - Ok(Some(x)) => { - should_wake = x; - } - Ok(None) => return Err(e), - Err(e) => return Err(io_error(e).into()), - } - } - if !wait_duration.is_zero() { - time::sleep(wait_duration).await; - } - } else { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + if !can_retry_error(&e, num_retries) { return Err(e); } + wait_duration = retry_after(num_retries); + + // after the first connect failure, + // we should invalidate the cache and wake up a new compute node + if num_retries == 0 { + invalidate_cache(node_info); + should_wake_with_error = Some(e); + } } - other => return other, } num_retries += 1; - info!(retries_left = num_retries, "retrying connect"); + info!(num_retries, "retrying connect"); } } @@ -395,41 +419,51 @@ pub async fn try_wake( node_info: &mut console::CachedNodeInfo, extra: &console::ConsoleReqExtra<'_>, creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, -) -> Result, WakeComputeError> { +) -> Result>, WakeComputeError> { info!("compute node's state has likely changed; requesting a wake-up"); - invalidate_cache(node_info); match creds.wake_compute(extra).await { // retry wake if the compute was in an invalid state Err(WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - })) => Ok(Some(true)), + })) => Ok(Some(ControlFlow::Continue(()))), // Update `node_info` and try again. Ok(Some(mut new)) => { new.config.reuse_password(&node_info.config); *node_info = new; - Ok(Some(false)) + Ok(Some(ControlFlow::Break(()))) } Err(e) => Err(e), Ok(None) => Ok(None), } } -fn retry_connect_in(err: &compute::ConnectionError, num_retries: u32) -> Option { +fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool { use std::io::ErrorKind; match err { - // retry all errors at least once immediately - _ if num_retries == 0 => Some(time::Duration::ZERO), - // keep retrying connection errors every 100ms - compute::ConnectionError::CouldNotConnect(io_err) => match io_err.kind() { - ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable => { - // 3/2 = 1.5 which seems to be an ok growth factor heuristic - Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) - } - _ => None, - }, + // retry all errors at least once + _ if num_retries == 0 => true, + // keep retrying connection errors + compute::ConnectionError::CouldNotConnect(io_err) + if num_retries < NUM_RETRIES_WAKE_COMPUTE => + { + matches!( + io_err.kind(), + ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable + ) + } // otherwise, don't retry - _ => None, + _ => false, + } +} + +pub fn retry_after(num_retries: u32) -> time::Duration { + match num_retries { + 0 => time::Duration::ZERO, + _ => { + // 3/2 = 1.5 which seems to be an ok growth factor heuristic + BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries) + } } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index a1f6cd3ed4..b9215cd90e 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,6 +1,4 @@ //! A group of high-level tests for connection establishing logic and auth. 
From 4204960942fc456906396072c63b373f600b89e5 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Tue, 1 Aug 2023 11:52:35 +0200
Subject: [PATCH 61/63] ci: fix upload-postgres-extensions-to-s3 job

commit 5f8fd640bfa8e5d4d23dbc3df1b0a521ec666e56
Author: Alek Westover
Date: Wed Jul 26 08:24:03 2023 -0400

    Upload Test Remote Extensions (#4792)

switched to using the release tag instead of `latest`, but the
`promote-images` job only uploads `latest` to the prod ECR.

The switch to using the release tag was good in principle, but we are
reverting that part to make the release pipeline work.

Note that a proper fix should abandon use of the `:latest` tag altogether:
currently, if a `main` pipeline runs concurrently with a `release` pipeline,
the `release` pipeline may end up using the `main` pipeline's images.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 27bad61639..dcdc388a93 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -955,7 +955,7 @@ jobs:
       version: [ v14, v15 ]
 
     env:
-      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }}
+      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
       AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
       AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }}
       S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }}

From 6eae4fc9aa9d143f6f2000fa2d761ccccf079710 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin
Date: Thu, 3 Aug 2023 07:48:09 +0100
Subject: [PATCH 62/63] Release 2023-08-02: update pg_embedding (#4877)

Cherry-picking ca4d71a9549be5638d84e6a9ff5c66ca14cbb05d from `main` into the
`release`

Co-authored-by: Vadim Kharitonov
---
 Dockerfile.compute-node | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 9759faf733..b2d096dc43 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -551,8 +551,8 @@ FROM build-deps AS pg-embedding-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ENV PATH "/usr/local/pgsql/bin/:$PATH"
 
-RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \
-    echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \
+    echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \
     mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \

From b1ddd01289f1092781d5603a88d0e75a9c23bc2b Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov
Date: Thu, 3 Aug 2023 15:28:31 +0200
Subject: [PATCH 63/63] Define NEON_SMGR to make it possible for extensions to
 use Neon SMG API (#4889)

Co-authored-by: Konstantin Knizhnik
Co-authored-by: Konstantin Knizhnik
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/revisions.json | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index ebedb34d01..da3885c34d 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f
+Subproject commit da3885c34db312afd555802be2ce985fafd1d8ad
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 1220c8a63f..770c6dffc5 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 1220c8a63f00101829f9222a5821fc084b4384c7
+Subproject commit 770c6dffc5ef6aac05bf049693877fb377eea6fc
diff --git a/vendor/revisions.json b/vendor/revisions.json
index f5d7428cd9..8579b5abaa 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,4 +1,4 @@
 {
-  "postgres-v15": "1220c8a63f00101829f9222a5821fc084b4384c7",
-  "postgres-v14": "ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f"
+  "postgres-v15": "770c6dffc5ef6aac05bf049693877fb377eea6fc",
+  "postgres-v14": "da3885c34db312afd555802be2ce985fafd1d8ad"
 }
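A note on the vendor pinning shown above: the `vendor/postgres-v*` submodules and `vendor/revisions.json` are bumped in lockstep. A hypothetical consistency check (not part of the repository; it assumes a `serde_json` dependency and that it is run from the repository root) could look like the following sketch:

```rust
// Hypothetical helper: verify that the commits recorded in vendor/revisions.json
// match what the vendor/postgres-v* submodules are actually checked out at.
use std::collections::HashMap;
use std::process::Command;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let json = std::fs::read_to_string("vendor/revisions.json")?;
    // e.g. {"postgres-v15": "770c6dff...", "postgres-v14": "da3885c3..."}
    let pins: HashMap<String, String> = serde_json::from_str(&json)?;

    for (name, expected) in &pins {
        // `git rev-parse HEAD` inside the submodule reports the checked-out commit.
        let out = Command::new("git")
            .arg("-C")
            .arg(format!("vendor/{name}"))
            .args(["rev-parse", "HEAD"])
            .output()?;
        let actual = String::from_utf8(out.stdout)?.trim().to_string();
        assert_eq!(
            &actual, expected,
            "vendor/{name} is not pinned to the commit recorded in revisions.json"
        );
    }
    Ok(())
}
```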