From df7a9d1407cf189a86dc54c8de7b7de4936dd842 Mon Sep 17 00:00:00 2001 From: Andrey Taranik Date: Thu, 17 Mar 2022 00:43:28 +0300 Subject: [PATCH 001/209] release fix 2022-03-16 (#1375) --- .circleci/ansible/deploy.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/ansible/deploy.yaml b/.circleci/ansible/deploy.yaml index c95524a8a5..2dd109f99a 100644 --- a/.circleci/ansible/deploy.yaml +++ b/.circleci/ansible/deploy.yaml @@ -119,7 +119,7 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID tags: - pageserver @@ -169,6 +169,6 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X POST {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID tags: - safekeeper From 73f247d537ebea4719461e0d0d3a8c5c92e45bb0 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 31 May 2022 16:00:50 +0400 Subject: [PATCH 002/209] Bump vendor/postgres to hotfix basebackup LSN comparison. --- vendor/postgres | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/postgres b/vendor/postgres index 038b2b98e5..658157375a 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 038b2b98e5c3d6274cbd43e9b822cdd946cb8b91 +Subproject commit 658157375a2b1b574766c1a055dde224c269a2f8 From cf350c6002e1107f8b800b5cc4e4f273ea432c5f Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 17:36:35 +0200 Subject: [PATCH 003/209] Use :local compute-tools tag to build compute-node image --- .circleci/config.yml | 24 ++++++++++++++---------- vendor/postgres | 2 +- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 5346e35c01..3377b907cb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -462,9 +462,6 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build neondatabase/compute-tools:latest image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | @@ -472,7 +469,10 @@ jobs: docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:latest -f Dockerfile.compute-tools . + --tag neondatabase/compute-tools:local \ + --tag neondatabase/compute-tools:latest \ + -f Dockerfile.compute-tools . 
+ # Only push :latest image docker push neondatabase/compute-tools:latest - run: name: Init postgres submodule @@ -482,7 +482,9 @@ jobs: command: | echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG=$(git log --oneline|wc -l) - docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:latest vendor/postgres + docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ + --tag neondatabase/compute-node:latest vendor/postgres \ + --build-arg COMPUTE_TOOLS_TAG=local docker push neondatabase/compute-node:${DOCKER_TAG} docker push neondatabase/compute-node:latest @@ -519,9 +521,6 @@ jobs: - checkout - setup_remote_docker: docker_layer_caching: true - # Build neondatabase/compute-tools:release image and push it to Docker hub - # TODO: this should probably also use versioned tag, not just :latest. - # XXX: but should it? We build and use it only locally now. - run: name: Build and push compute-tools Docker image command: | @@ -529,7 +528,10 @@ jobs: docker build \ --build-arg AWS_ACCESS_KEY_ID="${CACHEPOT_AWS_ACCESS_KEY_ID}" \ --build-arg AWS_SECRET_ACCESS_KEY="${CACHEPOT_AWS_SECRET_ACCESS_KEY}" \ - --tag neondatabase/compute-tools:release -f Dockerfile.compute-tools . + --tag neondatabase/compute-tools:release \ + --tag neondatabase/compute-tools:local \ + -f Dockerfile.compute-tools . + # Only push :release image docker push neondatabase/compute-tools:release - run: name: Init postgres submodule @@ -539,7 +541,9 @@ jobs: command: | echo $NEON_DOCKER_PWD | docker login -u $NEON_DOCKER_LOGIN --password-stdin DOCKER_TAG="release-$(git log --oneline|wc -l)" - docker build --tag neondatabase/compute-node:${DOCKER_TAG} --tag neondatabase/compute-node:release vendor/postgres + docker build --tag neondatabase/compute-node:${DOCKER_TAG} \ + --tag neondatabase/compute-node:release vendor/postgres \ + --build-arg COMPUTE_TOOLS_TAG=local docker push neondatabase/compute-node:${DOCKER_TAG} docker push neondatabase/compute-node:release diff --git a/vendor/postgres b/vendor/postgres index 658157375a..dba273190e 160000 --- a/vendor/postgres +++ b/vendor/postgres @@ -1 +1 @@ -Subproject commit 658157375a2b1b574766c1a055dde224c269a2f8 +Subproject commit dba273190e546c2a6345c38435e91780797c734f From cc856eca85eb6fb586537583b215b790d34cdb7d Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 31 May 2022 18:35:06 +0200 Subject: [PATCH 004/209] Install missing openssl packages in the Github Actions workflow --- .github/workflows/testing.yml | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 79b2ba05d0..ad7bddfabc 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -34,11 +34,11 @@ jobs: if: matrix.os == 'ubuntu-latest' run: | sudo apt update - sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev + sudo apt install build-essential libreadline-dev zlib1g-dev flex bison libseccomp-dev libssl-dev - - name: Install macOs postgres dependencies + - name: Install macOS postgres dependencies if: matrix.os == 'macos-latest' - run: brew install flex bison + run: brew install flex bison openssl - name: Set pg revision for caching id: pg_ver @@ -52,10 +52,27 @@ jobs: tmp_install/ key: ${{ runner.os }}-pg-${{ steps.pg_ver.outputs.pg_rev }} + - name: Set extra env for macOS + if: matrix.os == 'macos-latest' + run: | + echo 'LDFLAGS=-L/usr/local/opt/openssl@3/lib' >> $GITHUB_ENV + echo 
'CPPFLAGS=-I/usr/local/opt/openssl@3/include' >> $GITHUB_ENV + - name: Build postgres if: steps.cache_pg.outputs.cache-hit != 'true' run: make postgres + # Plain configure output can contain weird errors like 'error: C compiler cannot create executables' + # and the real cause will be inside config.log + - name: Print configure logs in case of failure + if: failure() + continue-on-error: true + run: | + echo '' && echo '=== config.log ===' && echo '' + cat tmp_install/build/config.log + echo '' && echo '=== configure.log ===' && echo '' + cat tmp_install/build/configure.log + - name: Cache cargo deps id: cache_cargo uses: actions/cache@v2 From 93467eae1f1b6dd0bb66bc0a2869b3f6e3f6afe5 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Sat, 22 Oct 2022 02:26:28 +0300 Subject: [PATCH 005/209] Hotfix to disable grant create on public schema `GRANT CREATE ON SCHEMA public` fails if there is no schema `public`. Disable it in release for now and make a better fix later (it is needed for v15 support). --- compute_tools/src/spec.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index e0c0e9404b..1e7cd51b6e 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -423,11 +423,11 @@ pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { ); db_client.simple_query(&alter_query)?; - // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. - // This is needed since postgres 15, where this privilege is removed by default. - let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string(); - info!("grant query for db {} : {}", &db.name, &grant_query); - db_client.simple_query(&grant_query)?; + // // Explicitly grant CREATE ON SCHEMA PUBLIC to the web_access user. + // // This is needed since postgres 15, where this privilege is removed by default. 
+ // let grant_query: String = "GRANT CREATE ON SCHEMA public TO web_access".to_string(); + // info!("grant query for db {} : {}", &db.name, &grant_query); + // db_client.simple_query(&grant_query)?; } Ok(()) From 323c4ecb4fc67d3ca63de9800fcafa00dfde9a91 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 25 Oct 2022 16:41:50 +0200 Subject: [PATCH 006/209] Add data format backward compatibility tests (#2626) --- .../actions/run-python-test-set/action.yml | 18 ++ .github/workflows/build_and_test.yml | 23 +- poetry.lock | 52 +++- pyproject.toml | 2 + test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_compatibility.py | 267 ++++++++++++++++++ 6 files changed, 352 insertions(+), 12 deletions(-) create mode 100644 test_runner/regress/test_compatibility.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index cc6ab65b76..07cb7edbe7 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -73,6 +73,13 @@ runs: shell: bash -euxo pipefail {0} run: ./scripts/pysync + - name: Download compatibility snapshot for Postgres 14 + uses: ./.github/actions/download + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14 + path: /tmp/compatibility_snapshot_pg14 + prefix: latest + - name: Run pytest env: NEON_BIN: /tmp/neon/bin @@ -80,6 +87,8 @@ runs: BUILD_TYPE: ${{ inputs.build_type }} AWS_ACCESS_KEY_ID: ${{ inputs.real_s3_access_key_id }} AWS_SECRET_ACCESS_KEY: ${{ inputs.real_s3_secret_access_key }} + COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg14 + ALLOW_BREAKING_CHANGES: contains(github.event.pull_request.labels.*.name, 'breaking changes') shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report @@ -154,6 +163,15 @@ runs: scripts/generate_and_push_perf_report.sh fi + - name: Upload compatibility snapshot for Postgres 14 + if: github.ref_name == 'release' + uses: ./.github/actions/upload + with: + name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} + # The path includes a test name (test_prepare_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test + path: /tmp/test_output/test_prepare_snapshot/compatibility_snapshot_pg14/ + prefix: latest + - name: Create Allure report if: always() uses: ./.github/actions/allure-report diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 14ee61c5b9..660f93b025 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -844,7 +844,7 @@ jobs: submodules: true fetch-depth: 0 - - name: Configure environment + - name: Configure environment run: | helm repo add neondatabase https://neondatabase.github.io/helm-charts aws --region us-east-2 eks update-kubeconfig --name dev-us-east-2-beta --role-arn arn:aws:iam::369495373322:role/github-runner @@ -853,3 +853,24 @@ jobs: run: | DOCKER_TAG=${{needs.tag.outputs.build-tag}} helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install -f .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --wait --timeout 15m0s + + promote-compatibility-test-snapshot: + runs-on: dev + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned + options: --init + needs: [ deploy, deploy-proxy ] + if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' + steps: + 
- name: Promote compatibility snapshot for the release + shell: bash -euxo pipefail {0} + env: + BUCKET: neon-github-public-dev + PREFIX: artifacts/latest + run: | + for build_type in debug release; do + OLD_FILENAME=compatibility-snapshot-${build_type}-pg14-${GITHUB_RUN_ID}.tar.zst + NEW_FILENAME=compatibility-snapshot-${build_type}-pg14.tar.zst + + time aws s3 mv --only-show-errors s3://${BUCKET}/${PREFIX}/${OLD_FILENAME} s3://${BUCKET}/${PREFIX}/${NEW_FILENAME} + done diff --git a/poetry.lock b/poetry.lock index 27de8508ce..dfcb16107f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -11,7 +11,7 @@ async-timeout = ">=3.0,<5.0" psycopg2-binary = ">=2.8.4" [package.extras] -sa = ["sqlalchemy[postgresql_psycopg2binary] (>=1.3,<1.5)"] +sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"] [[package]] name = "allure-pytest" @@ -80,7 +80,7 @@ python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] -tests_no_zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] +tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] [[package]] name = "aws-sam-translator" @@ -560,7 +560,7 @@ optional = false python-versions = ">=3.6.0" [package.extras] -unicode_backport = ["unicodedata2"] +unicode-backport = ["unicodedata2"] [[package]] name = "click" @@ -593,7 +593,7 @@ python-versions = ">=3.6" cffi = ">=1.12" [package.extras] -docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx-rtd-theme"] +docs = ["sphinx (>=1.6.5,!=1.8.0,!=3.1.0,!=3.1.1)", "sphinx_rtd_theme"] docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] pep8test = ["black", "flake8", "flake8-import-order", "pep8-naming"] sdist = ["setuptools_rust (>=0.11.4)"] @@ -738,9 +738,9 @@ python-versions = ">=3.6.1,<4.0" [package.extras] colors = ["colorama (>=0.4.3,<0.5.0)"] -pipfile_deprecated_finder = ["pipreqs", "requirementslib"] +pipfile-deprecated-finder = ["pipreqs", "requirementslib"] plugins = ["setuptools"] -requirements_deprecated_finder = ["pip-api", "pipreqs"] +requirements-deprecated-finder = ["pip-api", "pipreqs"] [[package]] name = "itsdangerous" @@ -815,7 +815,7 @@ python-versions = ">=2.7" [package.extras] docs = ["jaraco.packaging (>=3.2)", "rst.linker (>=1.9)", "sphinx"] testing = ["ecdsa", "enum34", "feedparser", "jsonlib", "numpy", "pandas", "pymongo", "pytest (>=3.5,!=3.7.3)", "pytest-black-multipy", "pytest-checkdocs (>=1.2.3)", "pytest-cov", "pytest-flake8 (<1.1.0)", "pytest-flake8 (>=1.1.1)", "scikit-learn", "sqlalchemy"] -"testing.libs" = ["simplejson", "ujson", "yajl"] +testing-libs = ["simplejson", "ujson", "yajl"] [[package]] name = "jsonpointer" @@ -836,11 +836,12 @@ python-versions = "*" [package.dependencies] attrs = ">=17.4.0" pyrsistent = ">=0.14.0" +setuptools = "*" six = ">=1.11.0" [package.extras] format = ["idna", "jsonpointer (>1.13)", "rfc3987", "strict-rfc3339", "webcolors"] -format_nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator 
(>0.1.0)", "webcolors"] +format-nongpl = ["idna", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "webcolors"] [[package]] name = "junit-xml" @@ -900,6 +901,7 @@ pytz = "*" PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""} requests = ">=2.5" responses = ">=0.9.0" +setuptools = {version = "*", optional = true, markers = "extra == \"server\""} sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""} werkzeug = ">=0.5,<2.2.0" xmltodict = "*" @@ -1008,6 +1010,7 @@ python-versions = ">=3.7.0,<4.0.0" jsonschema = ">=3.2.0,<5.0.0" openapi-schema-validator = ">=0.2.0,<0.3.0" PyYAML = ">=5.1" +setuptools = "*" [package.extras] requests = ["requests"] @@ -1340,7 +1343,7 @@ urllib3 = ">=1.21.1,<1.27" [package.extras] socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use_chardet_on_py3 = ["chardet (>=3.0.2,<6)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" @@ -1394,6 +1397,19 @@ python-versions = ">= 2.7" attrs = "*" pbr = "*" +[[package]] +name = "setuptools" +version = "65.5.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "mock", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + [[package]] name = "six" version = "1.16.0" @@ -1460,6 +1476,14 @@ category = "main" optional = false python-versions = ">=3.7,<4.0" +[[package]] +name = "types-toml" +version = "0.10.8" +description = "Typing stubs for toml" +category = "dev" +optional = false +python-versions = "*" + [[package]] name = "types-urllib3" version = "1.26.17" @@ -1544,7 +1568,7 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "ead1495454ee6d880bb240447025db93a25ebe263c2709de5f144cc2d85dc975" +content-hash = "17cdbfe90f1b06dffaf24c3e076384ec08dd4a2dce5a05e50565f7364932eb2d" [metadata.files] aiopg = [ @@ -2182,6 +2206,10 @@ sarif-om = [ {file = "sarif_om-1.0.4-py3-none-any.whl", hash = "sha256:539ef47a662329b1c8502388ad92457425e95dc0aaaf995fe46f4984c4771911"}, {file = "sarif_om-1.0.4.tar.gz", hash = "sha256:cd5f416b3083e00d402a92e449a7ff67af46f11241073eea0461802a3b5aef98"}, ] +setuptools = [ + {file = "setuptools-65.5.0-py3-none-any.whl", hash = "sha256:f62ea9da9ed6289bfe868cd6845968a2c854d1427f8548d52cae02a42b4f0356"}, + {file = "setuptools-65.5.0.tar.gz", hash = "sha256:512e5536220e38146176efb833d4a62aa726b7bbff82cfbc8ba9eaa3996e0b17"}, +] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = 
"sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, @@ -2210,6 +2238,10 @@ types-s3transfer = [ {file = "types-s3transfer-0.6.0.post3.tar.gz", hash = "sha256:92c3704e5d041202bfb5ddb79d083fd1a02de2c5dfec6a91576823e6b5c93993"}, {file = "types_s3transfer-0.6.0.post3-py3-none-any.whl", hash = "sha256:eedc5117275565b3c83662c0ccc81662a34da5dda8bd502b89d296b6d5cb091d"}, ] +types-toml = [ + {file = "types-toml-0.10.8.tar.gz", hash = "sha256:b7e7ea572308b1030dc86c3ba825c5210814c2825612ec679eb7814f8dd9295a"}, + {file = "types_toml-0.10.8-py3-none-any.whl", hash = "sha256:8300fd093e5829eb9c1fba69cee38130347d4b74ddf32d0a7df650ae55c2b599"}, +] types-urllib3 = [ {file = "types-urllib3-1.26.17.tar.gz", hash = "sha256:73fd274524c3fc7cd8cd9ceb0cb67ed99b45f9cb2831013e46d50c1451044800"}, {file = "types_urllib3-1.26.17-py3-none-any.whl", hash = "sha256:0d027fcd27dbb3cb532453b4d977e05bc1e13aefd70519866af211b3003d895d"}, diff --git a/pyproject.toml b/pyproject.toml index 1ee6fbe6f4..765e0b97eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,12 +28,14 @@ Werkzeug = "2.1.2" pytest-order = "^1.0.1" allure-pytest = "^2.10.0" pytest-asyncio = "^0.19.0" +toml = "^0.10.2" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" mypy = "==0.971" black = "^22.6.0" isort = "^5.10.1" +types-toml = "^0.10.8" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 4b2638bb2a..38a0db7cf7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -970,7 +970,7 @@ class NeonPageserverApiException(Exception): class NeonPageserverHttpClient(requests.Session): - def __init__(self, port: int, is_testing_enabled_or_skip, auth_token: Optional[str] = None): + def __init__(self, port: int, is_testing_enabled_or_skip: Fn, auth_token: Optional[str] = None): super().__init__() self.port = port self.auth_token = auth_token diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py new file mode 100644 index 0000000000..944ff64390 --- /dev/null +++ b/test_runner/regress/test_compatibility.py @@ -0,0 +1,267 @@ +import os +import re +import shutil +import subprocess +from pathlib import Path +from typing import Any, Dict, Union + +import pytest +import toml +from fixtures.neon_fixtures import ( + NeonCli, + NeonEnvBuilder, + NeonPageserverHttpClient, + PgBin, + PortDistributor, + wait_for_last_record_lsn, + wait_for_upload, +) +from fixtures.types import Lsn +from pytest import FixtureRequest + + +def dump_differs(first: Path, second: Path, output: Path) -> bool: + """ + Runs diff(1) command on two SQL dumps and write the output to the given output file. + Returns True if the dumps differ, False otherwise. + """ + + with output.open("w") as stdout: + rv = subprocess.run( + [ + "diff", + "--unified", # Make diff output more readable + "--ignore-matching-lines=^--", # Ignore changes in comments + "--ignore-blank-lines", + str(first), + str(second), + ], + stdout=stdout, + ) + + return rv.returncode != 0 + + +class PortReplacer(object): + """ + Class-helper for replacing ports in config files. 
+ """ + + def __init__(self, port_distributor: PortDistributor): + self.port_distributor = port_distributor + self.port_map: Dict[int, int] = {} + + def replace_port(self, value: Union[int, str]) -> Union[int, str]: + if isinstance(value, int): + if (known_port := self.port_map.get(value)) is not None: + return known_port + + self.port_map[value] = self.port_distributor.get_port() + return self.port_map[value] + + if isinstance(value, str): + # Use regex to find port in a string + # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432" + # See https://bugs.python.org/issue27657 + ports = re.findall(r":(\d+)(?:/|$)", value) + assert len(ports) == 1, f"can't find port in {value}" + port_int = int(ports[0]) + + if (known_port := self.port_map.get(port_int)) is not None: + return value.replace(f":{port_int}", f":{known_port}") + + self.port_map[port_int] = self.port_distributor.get_port() + return value.replace(f":{port_int}", f":{self.port_map[port_int]}") + + raise TypeError(f"unsupported type {type(value)} of {value=}") + + +def test_backward_compatibility( + pg_bin: PgBin, port_distributor: PortDistributor, test_output_dir: Path, request: FixtureRequest +): + compatibility_snapshot_dir_env = os.environ.get("COMPATIBILITY_SNAPSHOT_DIR") + assert ( + compatibility_snapshot_dir_env is not None + ), "COMPATIBILITY_SNAPSHOT_DIR is not set. It should be set to `compatibility_snapshot_pg14` path generateted by test_prepare_snapshot" + compatibility_snapshot_dir = Path(compatibility_snapshot_dir_env).resolve() + + # Make compatibility snapshot artifacts pickupable by Allure + # by copying the snapshot directory to the curent test output directory. + repo_dir = test_output_dir / "compatibility_snapshot" / "repo" + + shutil.copytree(compatibility_snapshot_dir / "repo", repo_dir) + + # Remove old logs to avoid confusion in test artifacts + for logfile in repo_dir.glob("**/*.log"): + logfile.unlink() + + # Remove tenants data for computes + for tenant in (repo_dir / "pgdatadirs" / "tenants").glob("*"): + shutil.rmtree(tenant) + + # Remove wal-redo temp directory + for tenant in (repo_dir / "tenants").glob("*"): + shutil.rmtree(tenant / "wal-redo-datadir.___temp") + + # Update paths and ports in config files + pr = PortReplacer(port_distributor) + + pageserver_toml = repo_dir / "pageserver.toml" + pageserver_config = toml.load(pageserver_toml) + new_local_path = pageserver_config["remote_storage"]["local_path"].replace( + "/test_prepare_snapshot/", + "/test_backward_compatibility/compatibility_snapshot/", + ) + + pageserver_config["remote_storage"]["local_path"] = new_local_path + pageserver_config["listen_http_addr"] = pr.replace_port(pageserver_config["listen_http_addr"]) + pageserver_config["listen_pg_addr"] = pr.replace_port(pageserver_config["listen_pg_addr"]) + pageserver_config["broker_endpoints"] = [ + pr.replace_port(ep) for ep in pageserver_config["broker_endpoints"] + ] + + with pageserver_toml.open("w") as f: + toml.dump(pageserver_config, f) + + snapshot_config_toml = repo_dir / "config" + snapshot_config = toml.load(snapshot_config_toml) + snapshot_config["etcd_broker"]["broker_endpoints"] = [ + pr.replace_port(ep) for ep in snapshot_config["etcd_broker"]["broker_endpoints"] + ] + snapshot_config["pageserver"]["listen_http_addr"] = pr.replace_port( + snapshot_config["pageserver"]["listen_http_addr"] + ) + snapshot_config["pageserver"]["listen_pg_addr"] = pr.replace_port( + snapshot_config["pageserver"]["listen_pg_addr"] + ) + for sk in 
snapshot_config["safekeepers"]: + sk["http_port"] = pr.replace_port(sk["http_port"]) + sk["pg_port"] = pr.replace_port(sk["pg_port"]) + + with (snapshot_config_toml).open("w") as f: + toml.dump(snapshot_config, f) + + # Ensure that snapshot doesn't contain references to the original path + rv = subprocess.run( + [ + "grep", + "--recursive", + "--binary-file=without-match", + "--files-with-matches", + "test_prepare_snapshot/repo", + str(repo_dir), + ], + capture_output=True, + text=True, + ) + assert ( + rv.returncode != 0 + ), f"there're files referencing `test_prepare_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" + + # NeonEnv stub to make NeonCli happy + config: Any = type("NeonEnvStub", (object,), {}) + config.rust_log_override = None + config.repo_dir = repo_dir + config.pg_version = "14" # Note: `pg_dumpall` (from pg_bin) version is set by DEFAULT_PG_VERSION_DEFAULT and can be overriden by DEFAULT_PG_VERSION env var + config.initial_tenant = snapshot_config["default_tenant_id"] + + # Check that we can start the project + cli = NeonCli(config) + try: + cli.raw_cli(["start"]) + request.addfinalizer(lambda: cli.raw_cli(["stop"])) + + result = cli.pg_start("main") + request.addfinalizer(lambda: cli.pg_stop("main")) + except Exception: + breaking_changes_allowed = ( + os.environ.get("ALLOW_BREAKING_CHANGES", "false").lower() == "true" + ) + if breaking_changes_allowed: + pytest.xfail("Breaking changes are allowed by ALLOW_BREAKING_CHANGES env var") + else: + raise + + connstr_all = re.findall(r"Starting postgres node at '([^']+)'", result.stdout) + assert len(connstr_all) == 1, f"can't parse connstr from {result.stdout}" + connstr = connstr_all[0] + + # Check that the project produces the same dump as the previous version. 
+ # The assert itself deferred to the end of the test + # to allow us to perform checks that change data before failing + pg_bin.run(["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump.sql'}"]) + initial_dump_differs = dump_differs( + compatibility_snapshot_dir / "dump.sql", + test_output_dir / "dump.sql", + test_output_dir / "dump.filediff", + ) + + # Check that project can be recovered from WAL + # loosely based on https://github.com/neondatabase/cloud/wiki/Recovery-from-WAL + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + pageserver_port = snapshot_config["pageserver"]["listen_http_addr"].split(":")[-1] + auth_token = snapshot_config["pageserver"]["auth_token"] + pageserver_http = NeonPageserverHttpClient( + port=pageserver_port, + is_testing_enabled_or_skip=lambda: True, # TODO: check if testing really enabled + auth_token=auth_token, + ) + + shutil.rmtree(repo_dir / "local_fs_remote_storage") + pageserver_http.timeline_delete(tenant_id, timeline_id) + pageserver_http.timeline_create(tenant_id, timeline_id) + pg_bin.run( + ["pg_dumpall", f"--dbname={connstr}", f"--file={test_output_dir / 'dump-from-wal.sql'}"] + ) + # The assert itself deferred to the end of the test + # to allow us to perform checks that change data before failing + dump_from_wal_differs = dump_differs( + test_output_dir / "dump.sql", + test_output_dir / "dump-from-wal.sql", + test_output_dir / "dump-from-wal.filediff", + ) + + # Check that we can interract with the data + pg_bin.run(["pgbench", "--time=10", "--progress=2", connstr]) + + assert not dump_from_wal_differs, "dump from WAL differs" + assert not initial_dump_differs, "initial dump differs" + + +@pytest.mark.order(after="test_backward_compatibility") +# Note: if renaming this test, don't forget to update a reference to it in a workflow file: +# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml +def test_prepare_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path): + # The test doesn't really test anything + # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`. + # + # There's no cleanup here, it allows to adjust the data in `test_backward_compatibility` itself without re-collecting it. 
+ neon_env_builder.pg_version = "14" + neon_env_builder.num_safekeepers = 3 + neon_env_builder.enable_local_fs_remote_storage() + + env = neon_env_builder.init_start() + pg = env.postgres.create_start("main") + pg_bin.run(["pgbench", "--initialize", "--scale=10", pg.connstr()]) + pg_bin.run(["pgbench", "--time=60", "--progress=2", pg.connstr()]) + pg_bin.run(["pg_dumpall", f"--dbname={pg.connstr()}", f"--file={test_output_dir / 'dump.sql'}"]) + + snapshot_config = toml.load(test_output_dir / "repo" / "config") + tenant_id = snapshot_config["default_tenant_id"] + timeline_id = dict(snapshot_config["branch_name_mappings"]["main"])[tenant_id] + + pageserver_http = env.pageserver.http_client() + lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + + pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, lsn) + wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) + + env.postgres.stop_all() + for sk in env.safekeepers: + sk.stop() + env.pageserver.stop() + + shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14") + # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it From 7a491f52c451172dd62729b93b35359145ee4661 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 25 Oct 2022 11:25:22 -0400 Subject: [PATCH 007/209] Add draw_timeline binary (#2688) --- Cargo.lock | 7 ++ Dockerfile | 3 +- pageserver/Cargo.toml | 1 + pageserver/src/bin/draw_timeline_dir.rs | 150 ++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 pageserver/src/bin/draw_timeline_dir.rs diff --git a/Cargo.lock b/Cargo.lock index 13774f7fe6..b39ca6e5a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2170,6 +2170,7 @@ dependencies = [ "serde_json", "serde_with", "signal-hook", + "svg_fmt", "tar", "tempfile", "thiserror", @@ -3461,6 +3462,12 @@ version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +[[package]] +name = "svg_fmt" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fb1df15f412ee2e9dfc1c504260fa695c1c3f10fe9f4a6ee2d2184d7d6450e2" + [[package]] name = "symbolic-common" version = "8.8.0" diff --git a/Dockerfile b/Dockerfile index cb4e213687..b0d934d480 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ COPY . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. 
RUN set -e \ -&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin safekeeper --bin proxy --locked --release \ +&& mold -run cargo build --bin pageserver --bin pageserver_binutils --bin draw_timeline_dir --bin safekeeper --bin proxy --locked --release \ && cachepot -s # Build final image @@ -65,6 +65,7 @@ RUN set -e \ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver_binutils /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/draw_timeline_dir /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 2139e24ee2..b075b86aa1 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -67,6 +67,7 @@ remote_storage = { path = "../libs/remote_storage" } workspace_hack = { version = "0.1", path = "../workspace_hack" } close_fds = "0.3.2" walkdir = "2.3.2" +svg_fmt = "0.4.1" [dev-dependencies] criterion = "0.4" diff --git a/pageserver/src/bin/draw_timeline_dir.rs b/pageserver/src/bin/draw_timeline_dir.rs new file mode 100644 index 0000000000..ea1ff7f3c7 --- /dev/null +++ b/pageserver/src/bin/draw_timeline_dir.rs @@ -0,0 +1,150 @@ +//! A tool for visualizing the arrangement of layerfiles within a timeline. +//! +//! It reads filenames from stdin and prints a svg on stdout. The image is a plot in +//! page-lsn space, where every delta layer is a rectangle and every image layer is a +//! thick line. Legend: +//! - The x axis (left to right) represents page index. +//! - The y axis represents LSN, growing upwards. +//! +//! Coordinates in both axis are compressed for better readability. +//! (see https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb) +//! +//! Example use: +//! ``` +//! $ cd test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE +//! $ ls | grep "__" | cargo run --release --bin draw_timeline_dir > out.svg +//! $ firefox out.svg +//! ``` +//! +//! This API was chosen so that we can easily work with filenames extracted from ssh, +//! or from pageserver log files. +//! +//! TODO Consider shipping this as a grafana panel plugin: +//! https://grafana.com/tutorials/build-a-panel-plugin/ +use anyhow::Result; +use pageserver::repository::Key; +use std::cmp::Ordering; +use std::io::{self, BufRead}; +use std::{ + collections::{BTreeMap, BTreeSet}, + ops::Range, +}; +use svg_fmt::{rectangle, rgb, BeginSvg, EndSvg, Fill, Stroke}; +use utils::{lsn::Lsn, project_git_version}; + +project_git_version!(GIT_VERSION); + +// Map values to their compressed coordinate - the index the value +// would have in a sorted and deduplicated list of all values. 
+fn build_coordinate_compression_map(coords: Vec) -> BTreeMap { + let set: BTreeSet = coords.into_iter().collect(); + + let mut map: BTreeMap = BTreeMap::new(); + for (i, e) in set.iter().enumerate() { + map.insert(*e, i); + } + + map +} + +fn parse_filename(name: &str) -> (Range, Range) { + let split: Vec<&str> = name.split("__").collect(); + let keys: Vec<&str> = split[0].split('-').collect(); + let mut lsns: Vec<&str> = split[1].split('-').collect(); + if lsns.len() == 1 { + lsns.push(lsns[0]); + } + + let keys = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); + let lsns = Lsn::from_hex(lsns[0]).unwrap()..Lsn::from_hex(lsns[1]).unwrap(); + (keys, lsns) +} + +fn main() -> Result<()> { + // Parse layer filenames from stdin + let mut ranges: Vec<(Range, Range)> = vec![]; + let stdin = io::stdin(); + for line in stdin.lock().lines() { + let range = parse_filename(&line.unwrap()); + ranges.push(range); + } + + // Collect all coordinates + let mut keys: Vec = vec![]; + let mut lsns: Vec = vec![]; + for (keyr, lsnr) in &ranges { + keys.push(keyr.start); + keys.push(keyr.end); + lsns.push(lsnr.start); + lsns.push(lsnr.end); + } + + // Analyze + let key_map = build_coordinate_compression_map(keys); + let lsn_map = build_coordinate_compression_map(lsns); + + // Initialize stats + let mut num_deltas = 0; + let mut num_images = 0; + + // Draw + let stretch = 3.0; // Stretch out vertically for better visibility + println!( + "{}", + BeginSvg { + w: key_map.len() as f32, + h: stretch * lsn_map.len() as f32 + } + ); + for (keyr, lsnr) in &ranges { + let key_start = *key_map.get(&keyr.start).unwrap(); + let key_end = *key_map.get(&keyr.end).unwrap(); + let key_diff = key_end - key_start; + let lsn_max = lsn_map.len(); + + if key_start >= key_end { + panic!("Invalid key range {}-{}", key_start, key_end); + } + + let lsn_start = *lsn_map.get(&lsnr.start).unwrap(); + let lsn_end = *lsn_map.get(&lsnr.end).unwrap(); + + let mut lsn_diff = (lsn_end - lsn_start) as f32; + let mut fill = Fill::None; + let mut margin = 0.05 * lsn_diff; // Height-dependent margin to disambiguate overlapping deltas + let mut lsn_offset = 0.0; + + // Fill in and thicken rectangle if it's an + // image layer so that we can see it. 
+ match lsn_start.cmp(&lsn_end) { + Ordering::Less => num_deltas += 1, + Ordering::Equal => { + num_images += 1; + lsn_diff = 0.3; + lsn_offset = -lsn_diff / 2.0; + margin = 0.05; + fill = Fill::Color(rgb(0, 0, 0)); + } + Ordering::Greater => panic!("Invalid lsn range {}-{}", lsn_start, lsn_end), + } + + println!( + " {}", + rectangle( + key_start as f32 + stretch * margin, + stretch * (lsn_max as f32 - (lsn_end as f32 - margin - lsn_offset)), + key_diff as f32 - stretch * 2.0 * margin, + stretch * (lsn_diff - 2.0 * margin) + ) + .fill(fill) + .stroke(Stroke::Color(rgb(0, 0, 0), 0.1)) + .border_radius(0.4) + ); + } + println!("{}", EndSvg); + + eprintln!("num_images: {}", num_images); + eprintln!("num_deltas: {}", num_deltas); + + Ok(()) +} From 70c3d18bb0c72992de1438c0f77a4e2b9c72fcab Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 26 Oct 2022 02:51:23 +0300 Subject: [PATCH 008/209] Do not release to new staging proxies on release (#2685) --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 660f93b025..1b8b380179 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -832,7 +832,7 @@ jobs: # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] if: | - (github.ref_name == 'main' || github.ref_name == 'release') && + (github.ref_name == 'main') && github.event_name != 'workflow_dispatch' defaults: run: From ede70d833cc547c00dd6496eee4483a98e39a04a Mon Sep 17 00:00:00 2001 From: mikecaat <35882227+mikecaat@users.noreply.github.com> Date: Wed, 26 Oct 2022 19:59:25 +0900 Subject: [PATCH 009/209] Add a docker-compose example file (#1943) (#2666) Co-authored-by: Masahiro Ikeda --- docker-compose/compute/shell/compute.sh | 48 +++++ .../compute/var/db/postgres/specs/spec.json | 141 ++++++++++++ docker-compose/docker-compose.yml | 200 ++++++++++++++++++ docker-compose/image/compute/Dockerfile | 10 + docs/docker.md | 64 ++++++ scripts/docker-compose_test.sh | 51 +++++ 6 files changed, 514 insertions(+) create mode 100755 docker-compose/compute/shell/compute.sh create mode 100644 docker-compose/compute/var/db/postgres/specs/spec.json create mode 100644 docker-compose/docker-compose.yml create mode 100644 docker-compose/image/compute/Dockerfile create mode 100755 scripts/docker-compose_test.sh diff --git a/docker-compose/compute/shell/compute.sh b/docker-compose/compute/shell/compute.sh new file mode 100755 index 0000000000..cef2b485f3 --- /dev/null +++ b/docker-compose/compute/shell/compute.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -eux + +PG_VERSION=${PG_VERSION:-14} + +SPEC_FILE_ORG=/var/db/postgres/specs/spec.json +SPEC_FILE=/tmp/spec.json + +echo "Waiting pageserver become ready." +while ! nc -z pageserver 6400; do + sleep 1; +done +echo "Page server is ready." + +echo "Create a tenant and timeline" +PARAMS=( + -sb + -X POST + -H "Content-Type: application/json" + -d "{}" + http://pageserver:9898/v1/tenant/ +) +tenant_id=$(curl "${PARAMS[@]}" | sed 's/"//g') + +PARAMS=( + -sb + -X POST + -H "Content-Type: application/json" + -d "{\"tenant_id\":\"${tenant_id}\", \"pg_version\": ${PG_VERSION}}" + "http://pageserver:9898/v1/tenant/${tenant_id}/timeline/" +) +result=$(curl "${PARAMS[@]}") +echo $result | jq . 
+ +echo "Overwrite tenant id and timeline id in spec file" +tenant_id=$(echo ${result} | jq -r .tenant_id) +timeline_id=$(echo ${result} | jq -r .timeline_id) + +sed "s/TENANT_ID/${tenant_id}/" ${SPEC_FILE_ORG} > ${SPEC_FILE} +sed -i "s/TIMELINE_ID/${timeline_id}/" ${SPEC_FILE} + +cat ${SPEC_FILE} + +echo "Start compute node" +/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \ + -C "postgresql://cloud_admin@localhost:55433/postgres" \ + -b /usr/local/bin/postgres \ + -S ${SPEC_FILE} diff --git a/docker-compose/compute/var/db/postgres/specs/spec.json b/docker-compose/compute/var/db/postgres/specs/spec.json new file mode 100644 index 0000000000..10ae0b0ecf --- /dev/null +++ b/docker-compose/compute/var/db/postgres/specs/spec.json @@ -0,0 +1,141 @@ +{ + "format_version": 1.0, + + "timestamp": "2022-10-12T18:00:00.000Z", + "operation_uuid": "0f657b36-4b0f-4a2d-9c2e-1dcd615e7d8c", + + "cluster": { + "cluster_id": "docker_compose", + "name": "docker_compose_test", + "state": "restarted", + "roles": [ + { + "name": "cloud_admin", + "encrypted_password": "b093c0d3b281ba6da1eacc608620abd8", + "options": null + } + ], + "databases": [ + ], + "settings": [ + { + "name": "fsync", + "value": "off", + "vartype": "bool" + }, + { + "name": "wal_level", + "value": "replica", + "vartype": "enum" + }, + { + "name": "hot_standby", + "value": "on", + "vartype": "bool" + }, + { + "name": "wal_log_hints", + "value": "on", + "vartype": "bool" + }, + { + "name": "log_connections", + "value": "on", + "vartype": "bool" + }, + { + "name": "port", + "value": "55433", + "vartype": "integer" + }, + { + "name": "shared_buffers", + "value": "1MB", + "vartype": "string" + }, + { + "name": "max_connections", + "value": "100", + "vartype": "integer" + }, + { + "name": "listen_addresses", + "value": "0.0.0.0", + "vartype": "string" + }, + { + "name": "max_wal_senders", + "value": "10", + "vartype": "integer" + }, + { + "name": "max_replication_slots", + "value": "10", + "vartype": "integer" + }, + { + "name": "wal_sender_timeout", + "value": "5s", + "vartype": "string" + }, + { + "name": "wal_keep_size", + "value": "0", + "vartype": "integer" + }, + { + "name": "password_encryption", + "value": "md5", + "vartype": "enum" + }, + { + "name": "restart_after_crash", + "value": "off", + "vartype": "bool" + }, + { + "name": "synchronous_standby_names", + "value": "walproposer", + "vartype": "string" + }, + { + "name": "shared_preload_libraries", + "value": "neon", + "vartype": "string" + }, + { + "name": "neon.safekeepers", + "value": "safekeeper1:5454,safekeeper2:5454,safekeeper3:5454", + "vartype": "string" + }, + { + "name": "neon.timeline_id", + "value": "TIMELINE_ID", + "vartype": "string" + }, + { + "name": "neon.tenant_id", + "value": "TENANT_ID", + "vartype": "string" + }, + { + "name": "neon.pageserver_connstring", + "value": "host=pageserver port=6400", + "vartype": "string" + }, + { + "name": "max_replication_write_lag", + "value": "500MB", + "vartype": "string" + }, + { + "name": "max_replication_flush_lag", + "value": "10GB", + "vartype": "string" + } + ] + }, + + "delta_operations": [ + ] +} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml new file mode 100644 index 0000000000..9ab775c3f9 --- /dev/null +++ b/docker-compose/docker-compose.yml @@ -0,0 +1,200 @@ +version: '3' + +services: + etcd: + image: quay.io/coreos/etcd:v3.5.4 + ports: + - 2379:2379 + - 2380:2380 + environment: + # This signifficantly speeds up etcd and we anyway don't data persistency there. 
+ ETCD_UNSAFE_NO_FSYNC: "1" + command: + - "etcd" + - "--auto-compaction-mode=revision" + - "--auto-compaction-retention=1" + - "--name=etcd-cluster" + - "--initial-cluster-state=new" + - "--initial-cluster-token=etcd-cluster-1" + - "--initial-cluster=etcd-cluster=http://etcd:2380" + - "--initial-advertise-peer-urls=http://etcd:2380" + - "--advertise-client-urls=http://etcd:2379" + - "--listen-client-urls=http://0.0.0.0:2379" + - "--listen-peer-urls=http://0.0.0.0:2380" + - "--quota-backend-bytes=134217728" # 128 MB + + minio: + image: quay.io/minio/minio:RELEASE.2022-10-20T00-55-09Z + ports: + - 9000:9000 + - 9001:9001 + environment: + - MINIO_ROOT_USER=minio + - MINIO_ROOT_PASSWORD=password + command: server /data --address :9000 --console-address ":9001" + + minio_create_buckets: + image: minio/mc + environment: + - MINIO_ROOT_USER=minio + - MINIO_ROOT_PASSWORD=password + entrypoint: + - "/bin/sh" + - "-c" + command: + - "until (/usr/bin/mc alias set minio http://minio:9000 $$MINIO_ROOT_USER $$MINIO_ROOT_PASSWORD) do + echo 'Waiting to start minio...' && sleep 1; + done; + /usr/bin/mc mb minio/neon --region=eu-north-1; + exit 0;" + depends_on: + - minio + + pageserver: + image: neondatabase/neon:${TAG:-latest} + environment: + - BROKER_ENDPOINT='http://etcd:2379' + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 6400:6400 # pg protocol handler + - 9898:9898 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "/usr/local/bin/pageserver -D /data/.neon/ + -c \"broker_endpoints=[$$BROKER_ENDPOINT]\" + -c \"listen_pg_addr='0.0.0.0:6400'\" + -c \"listen_http_addr='0.0.0.0:9898'\" + -c \"remote_storage={endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/pageserver/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper1: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper1:5454 + - SAFEKEEPER_ID=1 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7676:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper2: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper2:5454 + - SAFEKEEPER_ID=2 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7677:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + safekeeper3: + image: neondatabase/neon:${TAG:-latest} + environment: + - SAFEKEEPER_ADVERTISE_URL=safekeeper3:5454 + - SAFEKEEPER_ID=3 + - BROKER_ENDPOINT=http://etcd:2379 + - AWS_ACCESS_KEY_ID=minio + - 
AWS_SECRET_ACCESS_KEY=password + #- RUST_BACKTRACE=1 + ports: + #- 5454:5454 # pg protocol handler + - 7678:7676 # http endpoints + entrypoint: + - "/bin/sh" + - "-c" + command: + - "safekeeper --listen-pg=$$SAFEKEEPER_ADVERTISE_URL + --listen-http='0.0.0.0:7676' + --id=$$SAFEKEEPER_ID + --broker-endpoints=$$BROKER_ENDPOINT + -D /data + --remote-storage=\"{endpoint='http://minio:9000', + bucket_name='neon', + bucket_region='eu-north-1', + prefix_in_bucket='/safekeeper/'}\"" + depends_on: + - etcd + - minio_create_buckets + + compute: + build: + context: ./image/compute + args: + - COMPUTE_IMAGE=compute-node-v${PG_VERSION:-14}:${TAG:-latest} + - http_proxy=$http_proxy + - https_proxy=$https_proxy + environment: + - PG_VERSION=${PG_VERSION:-14} + #- RUST_BACKTRACE=1 + volumes: + - ./compute/var/db/postgres/specs/:/var/db/postgres/specs/ + - ./compute/shell/:/shell/ + ports: + - 55433:55433 # pg protocol handler + - 3080:3080 # http endpoints + entrypoint: + - "/shell/compute.sh" + depends_on: + - safekeeper1 + - safekeeper2 + - safekeeper3 + - pageserver + + compute_is_ready: + image: postgres:latest + entrypoint: + - "/bin/bash" + - "-c" + command: + - "until pg_isready -h compute -p 55433 ; do + echo 'Waiting to start compute...' && sleep 1; + done" + depends_on: + - compute diff --git a/docker-compose/image/compute/Dockerfile b/docker-compose/image/compute/Dockerfile new file mode 100644 index 0000000000..1b9d8c4900 --- /dev/null +++ b/docker-compose/image/compute/Dockerfile @@ -0,0 +1,10 @@ +ARG COMPUTE_IMAGE=compute-node-v14:latest +FROM neondatabase/${COMPUTE_IMAGE} + +USER root +RUN apt-get update && \ + apt-get install -y curl \ + jq \ + netcat + +USER postgres diff --git a/docs/docker.md b/docs/docker.md index 100cdd248b..42f0048e6f 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -18,3 +18,67 @@ We build all images after a successful `release` tests run and push automaticall 1. `neondatabase/compute-tools` and `neondatabase/compute-node` 2. `neondatabase/neon` + +## Docker Compose example + +You can see a [docker compose](https://docs.docker.com/compose/) example to create a neon cluster in [/docker-compose/docker-compose.yml](/docker-compose/docker-compose.yml). It creates the following conatainers. + +- etcd x 1 +- pageserver x 1 +- safekeeper x 3 +- compute x 1 +- MinIO x 1 # This is Amazon S3 compatible object storage + +### How to use + +1. create containers + +You can specify version of neon cluster using following environment values. +- PG_VERSION: postgres version for compute (default is 14) +- TAG: the tag version of [docker image](https://registry.hub.docker.com/r/neondatabase/neon/tags) (default is latest), which is tagged in [CI test](/.github/workflows/build_and_test.yml) +``` +$ cd docker-compose/docker-compose.yml +$ docker-compose down # remove the conainers if exists +$ PG_VERSION=15 TAG=2221 docker-compose up --build -d # You can specify the postgres and image version +Creating network "dockercompose_default" with the default driver +Creating dockercompose_etcd3_1 ... +(...omit...) +``` + +2. connect compute node +``` +$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass +$ psql -h localhost -p 55433 -U cloud_admin +postgres=# CREATE TABLE t(key int primary key, value text); +CREATE TABLE +postgres=# insert into t values(1,1); +INSERT 0 1 +postgres=# select * from t; + key | value +-----+------- + 1 | 1 +(1 row) +``` + +3. If you want to see the log, you can use `docker-compose logs` command. 
+``` +# check the container name you want to see +$ docker ps +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +d6968a5ae912 dockercompose_compute "/shell/compute.sh" 5 minutes ago Up 5 minutes 0.0.0.0:3080->3080/tcp, 0.0.0.0:55433->55433/tcp dockercompose_compute_1 +(...omit...) + +$ docker logs -f dockercompose_compute_1 +2022-10-21 06:15:48.757 GMT [56] LOG: connection authorized: user=cloud_admin database=postgres application_name=psql +2022-10-21 06:17:00.307 GMT [56] LOG: [NEON_SMGR] libpagestore: connected to 'host=pageserver port=6400' +(...omit...) +``` + +4. If you want to see durable data in MinIO which is s3 compatible storage + +Access http://localhost:9001 and sign in. + +- Username: `minio` +- Password: `password` + +You can see durable pages and WAL data in `neon` bucket. \ No newline at end of file diff --git a/scripts/docker-compose_test.sh b/scripts/docker-compose_test.sh new file mode 100755 index 0000000000..b4551365f8 --- /dev/null +++ b/scripts/docker-compose_test.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# this is a shortcut script to avoid duplication in CI +set -eux -o pipefail + +SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +COMPOSE_FILE=$SCRIPT_DIR/../docker-compose/docker-compose.yml + +COMPUTE_CONTAINER_NAME=dockercompose_compute_1 +SQL="CREATE TABLE t(key int primary key, value text); insert into t values(1,1); select * from t;" +PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -c '$SQL' postgres" + +cleanup() { + echo "show container information" + docker ps + docker-compose -f $COMPOSE_FILE logs + echo "stop containers..." + docker-compose -f $COMPOSE_FILE down +} + +echo "clean up containers if exists" +cleanup + +for pg_version in 14 15; do + echo "start containers (pg_version=$pg_version)." + PG_VERSION=$pg_version TAG=latest docker-compose -f $COMPOSE_FILE up --build -d + + echo "wait until the compute is ready. timeout after 60s. " + cnt=0 + while sleep 1; do + # check timeout + cnt=`expr $cnt + 1` + if [ $cnt -gt 60 ]; then + echo "timeout before the compute is ready." + cleanup + exit 1 + fi + + # check if the compute is ready + set +o pipefail + result=`docker-compose -f $COMPOSE_FILE logs "compute_is_ready" | grep "accepting connections" | wc -l` + set -o pipefail + if [ $result -eq 1 ]; then + echo "OK. The compute is ready to connect." + echo "execute simple queries." 
+ docker exec -it $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION" + cleanup + break + fi + done +done From b4c55f5d2445452329ee09527da1c3080503e23a Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 26 Oct 2022 17:32:31 -0400 Subject: [PATCH 010/209] Move pagestream api to libs/pageserver_api (#2698) --- Cargo.lock | 3 + libs/pageserver_api/Cargo.toml | 3 + libs/pageserver_api/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 161 +++++++++++++++++ .../pageserver_api}/src/reltag.rs | 0 pageserver/src/basebackup.rs | 2 +- pageserver/src/import_datadir.rs | 2 +- pageserver/src/lib.rs | 1 - pageserver/src/page_service.rs | 166 +----------------- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/timeline.rs | 2 +- pageserver/src/walingest.rs | 2 +- pageserver/src/walredo.rs | 2 +- 13 files changed, 181 insertions(+), 166 deletions(-) rename {pageserver => libs/pageserver_api}/src/reltag.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index b39ca6e5a7..3e67126add 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2189,7 +2189,10 @@ dependencies = [ name = "pageserver_api" version = "0.1.0" dependencies = [ + "anyhow", + "bytes", "const_format", + "postgres_ffi", "serde", "serde_with", "utils", diff --git a/libs/pageserver_api/Cargo.toml b/libs/pageserver_api/Cargo.toml index 5995325a2f..9121cd4989 100644 --- a/libs/pageserver_api/Cargo.toml +++ b/libs/pageserver_api/Cargo.toml @@ -7,6 +7,9 @@ edition = "2021" serde = { version = "1.0", features = ["derive"] } serde_with = "2.0" const_format = "0.2.21" +anyhow = { version = "1.0", features = ["backtrace"] } +bytes = "1.0.1" utils = { path = "../utils" } +postgres_ffi = { path = "../postgres_ffi" } workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index a36c1692a9..4890d54f36 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -2,6 +2,7 @@ use const_format::formatcp; /// Public API types pub mod models; +pub mod reltag; pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000; pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}"); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index dd40ba9e1c..4360f76fd1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -7,6 +7,10 @@ use utils::{ lsn::Lsn, }; +use crate::reltag::RelTag; +use anyhow::bail; +use bytes::{Buf, BufMut, Bytes, BytesMut}; + /// A state of a tenant in pageserver's memory. 
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TenantState { @@ -219,3 +223,160 @@ pub struct FailpointConfig { pub struct TimelineGcRequest { pub gc_horizon: Option, } + +// Wrapped in libpq CopyData +pub enum PagestreamFeMessage { + Exists(PagestreamExistsRequest), + Nblocks(PagestreamNblocksRequest), + GetPage(PagestreamGetPageRequest), + DbSize(PagestreamDbSizeRequest), +} + +// Wrapped in libpq CopyData +pub enum PagestreamBeMessage { + Exists(PagestreamExistsResponse), + Nblocks(PagestreamNblocksResponse), + GetPage(PagestreamGetPageResponse), + Error(PagestreamErrorResponse), + DbSize(PagestreamDbSizeResponse), +} + +#[derive(Debug)] +pub struct PagestreamExistsRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, +} + +#[derive(Debug)] +pub struct PagestreamNblocksRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, +} + +#[derive(Debug)] +pub struct PagestreamGetPageRequest { + pub latest: bool, + pub lsn: Lsn, + pub rel: RelTag, + pub blkno: u32, +} + +#[derive(Debug)] +pub struct PagestreamDbSizeRequest { + pub latest: bool, + pub lsn: Lsn, + pub dbnode: u32, +} + +#[derive(Debug)] +pub struct PagestreamExistsResponse { + pub exists: bool, +} + +#[derive(Debug)] +pub struct PagestreamNblocksResponse { + pub n_blocks: u32, +} + +#[derive(Debug)] +pub struct PagestreamGetPageResponse { + pub page: Bytes, +} + +#[derive(Debug)] +pub struct PagestreamErrorResponse { + pub message: String, +} + +#[derive(Debug)] +pub struct PagestreamDbSizeResponse { + pub db_size: i64, +} + +impl PagestreamFeMessage { + pub fn parse(mut body: Bytes) -> anyhow::Result { + // TODO these gets can fail + + // these correspond to the NeonMessageTag enum in pagestore_client.h + // + // TODO: consider using protobuf or serde bincode for less error prone + // serialization. 
+ let msg_tag = body.get_u8(); + match msg_tag { + 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + })), + 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + })), + 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + rel: RelTag { + spcnode: body.get_u32(), + dbnode: body.get_u32(), + relnode: body.get_u32(), + forknum: body.get_u8(), + }, + blkno: body.get_u32(), + })), + 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + latest: body.get_u8() != 0, + lsn: Lsn::from(body.get_u64()), + dbnode: body.get_u32(), + })), + _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), + } + } +} + +impl PagestreamBeMessage { + pub fn serialize(&self) -> Bytes { + let mut bytes = BytesMut::new(); + + match self { + Self::Exists(resp) => { + bytes.put_u8(100); /* tag from pagestore_client.h */ + bytes.put_u8(resp.exists as u8); + } + + Self::Nblocks(resp) => { + bytes.put_u8(101); /* tag from pagestore_client.h */ + bytes.put_u32(resp.n_blocks); + } + + Self::GetPage(resp) => { + bytes.put_u8(102); /* tag from pagestore_client.h */ + bytes.put(&resp.page[..]); + } + + Self::Error(resp) => { + bytes.put_u8(103); /* tag from pagestore_client.h */ + bytes.put(resp.message.as_bytes()); + bytes.put_u8(0); // null terminator + } + Self::DbSize(resp) => { + bytes.put_u8(104); /* tag from pagestore_client.h */ + bytes.put_i64(resp.db_size); + } + } + + bytes.into() + } +} diff --git a/pageserver/src/reltag.rs b/libs/pageserver_api/src/reltag.rs similarity index 100% rename from pageserver/src/reltag.rs rename to libs/pageserver_api/src/reltag.rs diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index d0a57a473b..973c3cd3a6 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -22,8 +22,8 @@ use std::time::SystemTime; use tar::{Builder, EntryType, Header}; use tracing::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ee3dc684e3..642e41765b 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -12,10 +12,10 @@ use tracing::*; use walkdir::WalkDir; use crate::pgdatadir_mapping::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walingest::WalIngest; use crate::walrecord::DecodedWALRecord; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::WalStreamDecoder; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c75f940386..52a4cb0381 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -8,7 +8,6 @@ pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; pub mod profiling; -pub mod reltag; pub mod repository; pub mod storage_sync; pub mod task_mgr; diff --git 
a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d61885314e..aec91bc7f1 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -10,8 +10,14 @@ // use anyhow::{bail, ensure, Context, Result}; -use bytes::{Buf, BufMut, Bytes, BytesMut}; +use bytes::Bytes; use futures::{Stream, StreamExt}; +use pageserver_api::models::{ + PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, + PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, + PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse, + PagestreamNblocksRequest, PagestreamNblocksResponse, +}; use std::io; use std::net::TcpListener; use std::str; @@ -35,7 +41,6 @@ use crate::config::{PageServerConf, ProfilingConfig}; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::profiling::profpoint_start; -use crate::reltag::RelTag; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::Timeline; @@ -45,163 +50,6 @@ use crate::CheckpointConfig; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; -// Wrapped in libpq CopyData -enum PagestreamFeMessage { - Exists(PagestreamExistsRequest), - Nblocks(PagestreamNblocksRequest), - GetPage(PagestreamGetPageRequest), - DbSize(PagestreamDbSizeRequest), -} - -// Wrapped in libpq CopyData -enum PagestreamBeMessage { - Exists(PagestreamExistsResponse), - Nblocks(PagestreamNblocksResponse), - GetPage(PagestreamGetPageResponse), - Error(PagestreamErrorResponse), - DbSize(PagestreamDbSizeResponse), -} - -#[derive(Debug)] -struct PagestreamExistsRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, -} - -#[derive(Debug)] -struct PagestreamNblocksRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, -} - -#[derive(Debug)] -struct PagestreamGetPageRequest { - latest: bool, - lsn: Lsn, - rel: RelTag, - blkno: u32, -} - -#[derive(Debug)] -struct PagestreamDbSizeRequest { - latest: bool, - lsn: Lsn, - dbnode: u32, -} - -#[derive(Debug)] -struct PagestreamExistsResponse { - exists: bool, -} - -#[derive(Debug)] -struct PagestreamNblocksResponse { - n_blocks: u32, -} - -#[derive(Debug)] -struct PagestreamGetPageResponse { - page: Bytes, -} - -#[derive(Debug)] -struct PagestreamErrorResponse { - message: String, -} - -#[derive(Debug)] -struct PagestreamDbSizeResponse { - db_size: i64, -} - -impl PagestreamFeMessage { - fn parse(mut body: Bytes) -> anyhow::Result { - // TODO these gets can fail - - // these correspond to the NeonMessageTag enum in pagestore_client.h - // - // TODO: consider using protobuf or serde bincode for less error prone - // serialization. 
- let msg_tag = body.get_u8(); - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - rel: RelTag { - spcnode: body.get_u32(), - dbnode: body.get_u32(), - relnode: body.get_u32(), - forknum: body.get_u8(), - }, - blkno: body.get_u32(), - })), - 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - latest: body.get_u8() != 0, - lsn: Lsn::from(body.get_u64()), - dbnode: body.get_u32(), - })), - _ => bail!("unknown smgr message tag: {},'{:?}'", msg_tag, body), - } - } -} - -impl PagestreamBeMessage { - fn serialize(&self) -> Bytes { - let mut bytes = BytesMut::new(); - - match self { - Self::Exists(resp) => { - bytes.put_u8(100); /* tag from pagestore_client.h */ - bytes.put_u8(resp.exists as u8); - } - - Self::Nblocks(resp) => { - bytes.put_u8(101); /* tag from pagestore_client.h */ - bytes.put_u32(resp.n_blocks); - } - - Self::GetPage(resp) => { - bytes.put_u8(102); /* tag from pagestore_client.h */ - bytes.put(&resp.page[..]); - } - - Self::Error(resp) => { - bytes.put_u8(103); /* tag from pagestore_client.h */ - bytes.put(resp.message.as_bytes()); - bytes.put_u8(0); // null terminator - } - Self::DbSize(resp) => { - bytes.put_u8(104); /* tag from pagestore_client.h */ - bytes.put_i64(resp.db_size); - } - } - - bytes.into() - } -} - fn copyin_stream(pgb: &mut PostgresBackend) -> impl Stream> + '_ { async_stream::try_stream! { loop { diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index ca931ed37d..0e334a63df 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,12 +7,12 @@ //! Clarify that) //! 
use crate::keyspace::{KeySpace, KeySpaceAccum}; -use crate::reltag::{RelTag, SlruKind}; use crate::repository::*; use crate::tenant::Timeline; use crate::walrecord::NeonWalRecord; use anyhow::{bail, ensure, Result}; use bytes::{Buf, Bytes}; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, TimestampTz, TransactionId}; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 194ca0d857..6a96254df4 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -37,8 +37,8 @@ use crate::metrics::TimelineMetrics; use crate::pgdatadir_mapping::BlockNumber; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key}; -use crate::reltag::RelTag; use crate::tenant_config::TenantConfOpt; +use pageserver_api::reltag::RelTag; use postgres_ffi::to_pg_timestamp; use utils::{ diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9a6b99d991..8c81ed824b 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -31,10 +31,10 @@ use bytes::{Buf, Bytes, BytesMut}; use tracing::*; use crate::pgdatadir_mapping::*; -use crate::reltag::{RelTag, SlruKind}; use crate::tenant::Timeline; use crate::walrecord::*; use crate::ZERO_PAGE; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e683c301d8..1cde11082e 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -43,10 +43,10 @@ use crate::metrics::{ WAL_REDO_WAIT_TIME, }; use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block}; -use crate::reltag::{RelTag, SlruKind}; use crate::repository::Key; use crate::walrecord::NeonWalRecord; use crate::{config::PageServerConf, TEMP_FILE_SUFFIX}; +use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::v14::nonrelfile_utils::{ From ab0be7b8da124839dbaf5fc85978d292e6ef427c Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Thu, 27 Oct 2022 03:50:46 +0300 Subject: [PATCH 011/209] Avoid debian-testing packages in compute Dockerfiles plv8 can only be built with a fairly new gold linker version. We used to install it via binutils packages from testing, but it also updates libc and that causes troubles in the resulting image as different extensions were built against different libc versions. We could either use libc from debian-testing everywhere or restrain from using testing packages and install necessary programs manually. This patch uses the latter approach: gold for plv8 and cmake for h3 are installed manually. In a passing declare h3_postgis as a safe extension (previous omission). 
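As a rough way to verify the result inside the build layers, a sanity check along the following lines could be run (an illustrative sketch only; the expected version strings are assumptions taken from what this patch installs: gold from binutils 2.38, CMake 3.24.2, and bullseye's stock glibc 2.31):

```
# Sketch of a post-install sanity check (not part of the build itself):
# the hand-installed tools should be the ones on PATH, and bullseye's
# libc should be left untouched by any testing package.
gold --version  | head -n1   # expect gold from GNU Binutils 2.38
cmake --version | head -n1   # expect cmake version 3.24.2
ldd --version   | head -n1   # expect glibc 2.31, i.e. bullseye's own libc
```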
--- Dockerfile.compute-node-v14 | 87 ++++++++++++++++++++++--------------- Dockerfile.compute-node-v15 | 74 ++++++++++++++++++------------- 2 files changed, 95 insertions(+), 66 deletions(-) diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node-v14 index 6d2b285fa3..035dfc0d08 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node-v14 @@ -1,24 +1,26 @@ -ARG TAG=pinned -# apparently, ARGs don't get replaced in RUN commands in kaniko -# ARG POSTGIS_VERSION=3.3.0 -# ARG PLV8_VERSION=3.1.4 -# ARG PG_VERSION=v14 +# +# This file is identical to the Dockerfile.compute-node-v15 file +# except for the version of Postgres that is built. +# +ARG TAG=pinned + +######################################################################################### # # Layer "build-deps" # +######################################################################################### FROM debian:bullseye-slim AS build-deps -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config +######################################################################################### # # Layer "pg-build" # Build Postgres from the neon postgres repository. # +######################################################################################### FROM build-deps AS pg-build COPY vendor/postgres-v14 postgres RUN cd postgres && \ @@ -29,22 +31,20 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install +######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # -# PostGIS compiles against neon postgres sources without changes. Perhaps we -# could even use the upstream binaries, compiled against vanilla Postgres, but -# it would require some investigation to check that it works, and also keeps -# working in the future. So for now, we compile our own binaries. 
+######################################################################################### FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ - tar xvzf postgis-3.3.0.tar.gz && \ - cd postgis-3.3.0 && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ + tar xvzf postgis-3.3.1.tar.gz && \ + cd postgis-3.3.1 && \ ./autogen.sh && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ ./configure && \ @@ -57,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.0.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control +######################################################################################### # # Layer "plv8-build" # Build plv8 # +######################################################################################### FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils -# https://github.com/plv8/plv8/issues/475 -# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN apt update && \ - apt install -y --no-install-recommends -t testing binutils +# https://github.com/plv8/plv8/issues/475: +# v8 uses gold for linking and sets `--thread-count=4` which breaks +# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) +# Install newer gold version manually as debian-testing binutils version updates +# libc version, which in turn breaks other extension built against non-testing libc. 
+RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ + tar xvzf binutils-2.38.tar.gz && \ + cd binutils-2.38 && \ + cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ + cd ../bfd && ./configure && make bfdver.h && \ + cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ + cp /usr/local/bin/ld.gold /usr/bin/gold # Sed is used to patch for https://github.com/plv8/plv8/issues/503 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -77,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +######################################################################################### # # Layer "h3-pg-build" # Build h3_pg # +######################################################################################### FROM build-deps AS h3-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # packaged cmake is too old -RUN apt update && \ - apt install -y --no-install-recommends -t testing cmake +RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ + -q -O /tmp/cmake-install.sh \ + && chmod u+x /tmp/cmake-install.sh \ + && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ + && rm /tmp/cmake-install.sh RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ tar xvzf h3.tgz && \ @@ -110,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### # # Layer "neon-pg-ext-build" # compile neon extensions # +######################################################################################### FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -128,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ -C pgxn/neon \ -s install +######################################################################################### +# # Compile and run the Neon-specific `compute_ctl` binary +# +######################################################################################### FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +######################################################################################### # # Clean up postgres folder before inclusion # +######################################################################################### FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql @@ -155,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### # # Final layer # Put it all together into the final image # +######################################################################################### FROM debian:bullseye-slim # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ @@ -175,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libreadline8 for psql # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -# GLIBC 2.34 for plv8. -# Debian bullseye provides GLIBC 2.31, so we install the library from testing # # Lastly, link compute_ctl into zenith_ctl while we're at it, # so that we don't need to put this in another layer. @@ -189,12 +212,6 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - echo "Installing GLIBC 2.34" && \ - echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update && \ - apt install -y --no-install-recommends -t testing libc6 && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 index b7b1f25103..0b6e570b44 100644 --- a/Dockerfile.compute-node-v15 +++ b/Dockerfile.compute-node-v15 @@ -4,26 +4,23 @@ # ARG TAG=pinned -# apparently, ARGs don't get replaced in RUN commands in kaniko -# ARG POSTGIS_VERSION=3.3.1 -# ARG PLV8_VERSION=3.1.4 -# ARG PG_VERSION=v15 +######################################################################################### # # Layer "build-deps" # +######################################################################################### FROM debian:bullseye-slim AS build-deps -RUN echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev zlib1g-dev libxml2-dev \ - libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libglib2.0-dev + apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config +######################################################################################### # # Layer "pg-build" # Build Postgres from the neon postgres repository. 
# +######################################################################################### FROM build-deps AS pg-build COPY vendor/postgres-v15 postgres RUN cd postgres && \ @@ -34,14 +31,12 @@ RUN cd postgres && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install +######################################################################################### # # Layer "postgis-build" # Build PostGIS from the upstream PostGIS mirror. # -# PostGIS compiles against neon postgres sources without changes. Perhaps we -# could even use the upstream binaries, compiled against vanilla Postgres, but -# it would require some investigation to check that it works, and also keeps -# working in the future. So for now, we compile our own binaries. +######################################################################################### FROM build-deps AS postgis-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -62,19 +57,29 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control +######################################################################################### # # Layer "plv8-build" # Build plv8 # +######################################################################################### FROM build-deps AS plv8-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 + apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils -# https://github.com/plv8/plv8/issues/475 -# Debian bullseye provides binutils 2.35 when >= 2.38 is necessary -RUN apt update && \ - apt install -y --no-install-recommends -t testing binutils +# https://github.com/plv8/plv8/issues/475: +# v8 uses gold for linking and sets `--thread-count=4` which breaks +# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) +# Install newer gold version manually as debian-testing binutils version updates +# libc version, which in turn breaks other extension built against non-testing libc. 
+RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ + tar xvzf binutils-2.38.tar.gz && \ + cd binutils-2.38 && \ + cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ + cd ../bfd && ./configure && make bfdver.h && \ + cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ + cp /usr/local/bin/ld.gold /usr/bin/gold # Sed is used to patch for https://github.com/plv8/plv8/issues/503 RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ @@ -82,21 +87,25 @@ RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ cd plv8-3.1.4 && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ + make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ rm -rf /plv8-* && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control +######################################################################################### # # Layer "h3-pg-build" # Build h3_pg # +######################################################################################### FROM build-deps AS h3-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # packaged cmake is too old -RUN apt update && \ - apt install -y --no-install-recommends -t testing cmake +RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ + -q -O /tmp/cmake-install.sh \ + && chmod u+x /tmp/cmake-install.sh \ + && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ + && rm /tmp/cmake-install.sh RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ tar xvzf h3.tgz && \ @@ -115,12 +124,15 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### # # Layer "neon-pg-ext-build" # compile neon extensions # +######################################################################################### FROM build-deps AS neon-pg-ext-build COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -133,16 +145,22 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ -C pgxn/neon \ -s install +######################################################################################### +# # Compile and run the Neon-specific `compute_ctl` binary +# +######################################################################################### FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . 
RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto +######################################################################################### # # Clean up postgres folder before inclusion # +######################################################################################### FROM neon-pg-ext-build AS postgres-cleanup-layer COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql @@ -160,10 +178,12 @@ RUN rm -r /usr/local/pgsql/lib/pgxs/src # if they were to be used by other libraries. RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### # # Final layer # Put it all together into the final image # +######################################################################################### FROM debian:bullseye-slim # Add user postgres RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ @@ -180,8 +200,6 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libreadline8 for psql # libossp-uuid16 for extension ossp-uuid # libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -# GLIBC 2.34 for plv8. -# Debian bullseye provides GLIBC 2.31, so we install the library from testing # # Lastly, link compute_ctl into zenith_ctl while we're at it, # so that we don't need to put this in another layer. @@ -194,12 +212,6 @@ RUN apt update && \ libproj19 \ libprotobuf-c1 && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ - echo "Installing GLIBC 2.34" && \ - echo "deb http://ftp.debian.org/debian testing main" >> /etc/apt/sources.list && \ - echo "Package: *\nPin: release n=bullseye\nPin-Priority: 50" > /etc/apt/preferences && \ - apt update && \ - apt install -y --no-install-recommends -t testing libc6 && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ ln /usr/local/bin/compute_ctl /usr/local/bin/zenith_ctl USER postgres From 8e5bb3ed4989bbf86ff8df16b145e27f28e8e9e7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 27 Oct 2022 11:09:09 +0400 Subject: [PATCH 012/209] Enable etcd compaction in neon_local. --- control_plane/src/etcd.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/control_plane/src/etcd.rs b/control_plane/src/etcd.rs index ccadfa8ce7..ca2df8a50b 100644 --- a/control_plane/src/etcd.rs +++ b/control_plane/src/etcd.rs @@ -52,6 +52,10 @@ pub fn start_etcd_process(env: &local_env::LocalEnv) -> anyhow::Result<()> { // size smaller. Our test etcd clusters are very small. // See https://github.com/etcd-io/etcd/issues/7910 "--quota-backend-bytes=100000000".to_string(), + // etcd doesn't compact (vacuum) with default settings, + // enable it to prevent space exhaustion. + "--auto-compaction-mode=revision".to_string(), + "--auto-compaction-retention=1".to_string(), ]) .stdout(Stdio::from(etcd_stdout_file)) .stderr(Stdio::from(etcd_stderr_file)) From 0816168296883b086dbc50af0b4ebc9fbdf3afe7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 15 Dec 2022 12:36:57 +0400 Subject: [PATCH 013/209] Hotfix: terminate subscription if channel is full. Might help as a hotfix, but need to understand root better. 
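For context, the "channel is full" condition is the lagging case of a bounded broadcast channel: when a subscriber falls more than the channel capacity behind, the oldest messages are overwritten and recv() reports how many were skipped. A minimal standalone sketch of that behaviour with a plain tokio broadcast channel (an illustration of the mechanism only, not the broker's actual subscription loop):

```
// Illustration: a slow subscriber on a bounded tokio broadcast channel.
// Once it lags more than `capacity` messages behind, recv() yields
// RecvError::Lagged(skipped); this hotfix terminates the subscription at
// that point instead of warning and carrying on.
use tokio::sync::broadcast::{self, error::RecvError};

#[tokio::main]
async fn main() {
    let (tx, mut rx) = broadcast::channel::<u64>(16);

    // Publish far more messages than the channel can hold before reading any.
    for i in 0..100u64 {
        let _ = tx.send(i);
    }

    loop {
        match rx.recv().await {
            Ok(msg) => println!("got {msg}"),
            Err(RecvError::Lagged(skipped)) => {
                // "Channel is full" from this receiver's point of view:
                // `skipped` messages were overwritten before we could read them.
                eprintln!("dropped {skipped} messages, terminating subscription");
                break;
            }
            Err(RecvError::Closed) => break,
        }
    }
}
```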
--- storage_broker/src/bin/storage_broker.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index fdf2637b4d..1a743394ad 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -369,8 +369,9 @@ impl BrokerService for Broker { Err(RecvError::Lagged(skipped_msg)) => { missed_msgs += skipped_msg; if let Poll::Ready(_) = futures::poll!(Box::pin(warn_interval.tick())) { - warn!("dropped {} messages, channel is full", missed_msgs); - missed_msgs = 0; + error!("subscription id={}, key={:?}, addr={:?} dropped {} messages, channel is full", + subscriber.id, subscriber.key, subscriber.remote_addr, missed_msgs); + Err(Status::new(Code::Internal, "full channel"))?; } } Err(RecvError::Closed) => { From d24de169a7ac1de84f08a420295e309a5946f893 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 16 Dec 2022 00:57:59 +0400 Subject: [PATCH 014/209] Deploy broker with L4 LB in new env. Seems to be fixing issue with missing keepalives. --- .../ansible/prod.ap-southeast-1.hosts.yaml | 2 +- .github/ansible/prod.eu-central-1.hosts.yaml | 2 +- .github/ansible/prod.us-east-2.hosts.yaml | 2 +- .github/ansible/prod.us-west-2.hosts.yaml | 2 +- .github/ansible/staging.eu-west-1.hosts.yaml | 2 +- .github/ansible/staging.us-east-2.hosts.yaml | 2 +- ...ev-eu-west-1-zeta.neon-storage-broker.yaml | 33 ++++++++----------- ...ev-us-east-2-beta.neon-storage-broker.yaml | 33 ++++++++----------- ...utheast-1-epsilon.neon-storage-broker.yaml | 33 ++++++++----------- ...u-central-1-gamma.neon-storage-broker.yaml | 33 ++++++++----------- ...d-us-east-2-delta.neon-storage-broker.yaml | 33 ++++++++----------- ...rod-us-west-2-eta.neon-storage-broker.yaml | 33 ++++++++----------- .github/workflows/build_and_test.yml | 4 +-- 13 files changed, 92 insertions(+), 122 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index bcc7bb3b16..648029c120 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-ap-southeast-1 bucket_region: ap-southeast-1 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml index 2b372d0fcb..c285a9f3b6 100644 --- a/.github/ansible/prod.eu-central-1.hosts.yaml +++ b/.github/ansible/prod.eu-central-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-eu-central-1 bucket_region: eu-central-1 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.gamma.eu-central-1.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml index 7a4002ec88..1753068b8c 100644 --- a/.github/ansible/prod.us-east-2.hosts.yaml +++ b/.github/ansible/prod.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-east-2 bucket_region: us-east-2 
console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.delta.us-east-2.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 682ee5994d..7d6e49bf9c 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-prod-storage-us-west-2 bucket_region: us-west-2 console_mgmt_base_url: http://console-release.local - broker_endpoint: https://storage-broker.eta.us-west-2.internal.aws.neon.tech:443 + broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index 90f00175b0..cfcc3a9ae8 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-dev-storage-eu-west-1 bucket_region: eu-west-1 console_mgmt_base_url: http://console-staging.local - broker_endpoint: https://storage-broker.zeta.eu-west-1.internal.aws.neon.build:443 + broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index d2b7fae12a..78a4582e57 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -3,7 +3,7 @@ storage: bucket_name: neon-staging-storage-us-east-2 bucket_region: us-east-2 console_mgmt_base_url: http://console-staging.local - broker_endpoint: https://storage-broker.beta.us-east-2.internal.aws.neon.build:443 + broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051 pageserver_config_stub: pg_distrib_dir: /usr/local remote_storage: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index e876367a18..c6e682f571 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: staging neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.zeta.eu-west-1.internal.aws.neon.build - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.zeta.eu-west-1.internal.aws.neon.build - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + 
# assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index dcf4b99de2..c7682d24c0 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: staging neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.beta.us-east-2.internal.aws.neon.build - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.beta.us-east-2.internal.aws.neon.build - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.beta.us-east-2.internal.aws.neon.build + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml index 0abc6ebaa1..92b1777d0b 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.epsilon.ap-southeast-1.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: 
storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml index d44a3eab5c..f89df4533a 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.gamma.eu-central-1.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.gamma.eu-central-1.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml index b9eeff5681..8cbc1af7cf 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.delta.us-east-2.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.delta.us-east-2.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.delta.us-east-2.internal.aws.neon.tech + # service.type -- Service type + type: 
LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml index 249f76303a..8a7488948d 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-storage-broker.yaml @@ -3,27 +3,22 @@ podLabels: neon_env: production neon_service: storage-broker -ingress: - enabled: true +# Use L4 LB +service: + # service.annotations -- Annotations to add to the service annotations: - kubernetes.io/ingress.class: nginx-internal - nginx.ingress.kubernetes.io/backend-protocol: "GRPC" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/force-ssl-redirect: "true" - # we have basically infinite streams, disable body size limit - nginx.ingress.kubernetes.io/proxy-body-size: "0" - cert-manager.io/cluster-issuer: "cert-manager-clusterissuer" - - hosts: - - host: storage-broker.eta.us-west-2.internal.aws.neon.tech - paths: - - path: / - pathType: Prefix - tls: - - hosts: - - storage-broker.eta.us-west-2.internal.aws.neon.tech - secretName: storage-broker-tls + service.beta.kubernetes.io/aws-load-balancer-type: external # use newer AWS Load Balancer Controller + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internal # deploy LB to private subnet + # assign service to this name at external-dns + external-dns.alpha.kubernetes.io/hostname: storage-broker-lb.eta.us-west-2.internal.aws.neon.tech + # service.type -- Service type + type: LoadBalancer + # service.port -- broker listen port + port: 50051 +ingress: + enabled: false metrics: enabled: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7a887cbece..43b855a2b0 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1072,7 +1072,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s deploy-proxy-prod-new: runs-on: prod @@ -1149,7 +1149,7 @@ jobs: - name: Deploy storage-broker run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ 
secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s promote-compatibility-data: runs-on: [ self-hosted, dev, x64 ] From 73ea0a0b0188cfa40d09fb458e91f3cb38ce7425 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 16 Dec 2022 13:40:01 +0200 Subject: [PATCH 015/209] fix(remote_storage): use cached credentials (#3128) IMDSv2 has limits, and if we query it on every s3 interaction we are going to go over those limits. Changes the s3_bucket client configuration to use: - ChainCredentialsProvider to handle env variables or imds usage - LazyCachingCredentialsProvider to actually cache any credentials Related: https://github.com/awslabs/aws-sdk-rust/issues/629 Possibly related: https://github.com/neondatabase/neon/issues/3118 --- libs/remote_storage/src/s3_bucket.rs | 47 +++++++++++----------------- 1 file changed, 18 insertions(+), 29 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ab1e5da6c5..740f3753d8 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -4,14 +4,13 @@ //! allowing multiple api users to independently work with the same S3 bucket, if //! their bucket prefixes are both specified and different. -use std::env::var; use std::sync::Arc; -use std::time::Duration; use anyhow::Context; use aws_config::{ - environment::credentials::EnvironmentVariableCredentialsProvider, imds, - imds::credentials::ImdsCredentialsProvider, meta::credentials::provide_credentials_fn, + environment::credentials::EnvironmentVariableCredentialsProvider, + imds::credentials::ImdsCredentialsProvider, + meta::credentials::{CredentialsProviderChain, LazyCachingCredentialsProvider}, }; use aws_sdk_s3::{ config::Config, @@ -20,7 +19,6 @@ use aws_sdk_s3::{ Client, Endpoint, Region, }; use aws_smithy_http::body::SdkBody; -use aws_types::credentials::{CredentialsError, ProvideCredentials}; use hyper::Body; use tokio::{io, sync::Semaphore}; use tokio_util::io::ReaderStream; @@ -31,8 +29,6 @@ use crate::{ Download, DownloadError, RemotePath, RemoteStorage, S3Config, REMOTE_STORAGE_PREFIX_SEPARATOR, }; -const DEFAULT_IMDS_TIMEOUT: Duration = Duration::from_secs(10); - pub(super) mod metrics { use metrics::{register_int_counter_vec, IntCounterVec}; use once_cell::sync::Lazy; @@ -122,30 +118,23 @@ impl S3Bucket { "Creating s3 remote storage for S3 bucket {}", aws_config.bucket_name ); + + let credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + let env_creds = EnvironmentVariableCredentialsProvider::new(); + // uses imds v2 + let imds = ImdsCredentialsProvider::builder().build(); + + // finally add caching. 
+ // this might change in future, see https://github.com/awslabs/aws-sdk-rust/issues/629 + LazyCachingCredentialsProvider::builder() + .load(CredentialsProviderChain::first_try("env", env_creds).or_else("imds", imds)) + .build() + }; + let mut config_builder = Config::builder() .region(Region::new(aws_config.bucket_region.clone())) - .credentials_provider(provide_credentials_fn(|| async { - match var("AWS_ACCESS_KEY_ID").is_ok() && var("AWS_SECRET_ACCESS_KEY").is_ok() { - true => { - EnvironmentVariableCredentialsProvider::new() - .provide_credentials() - .await - } - false => { - let imds_client = imds::Client::builder() - .connect_timeout(DEFAULT_IMDS_TIMEOUT) - .read_timeout(DEFAULT_IMDS_TIMEOUT) - .build() - .await - .map_err(CredentialsError::unhandled)?; - ImdsCredentialsProvider::builder() - .imds_client(imds_client) - .build() - .provide_credentials() - .await - } - } - })); + .credentials_provider(credentials_provider); if let Some(custom_endpoint) = aws_config.endpoint.clone() { let endpoint = Endpoint::immutable( From ece05556002b9c95fb9ad8f0a475ab10468e77fd Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Wed, 14 Dec 2022 21:28:14 +0100 Subject: [PATCH 016/209] Push proxy metrics to Victoria Metrics (#3106) --- .../dev-eu-west-1-zeta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../dev-us-east-2-beta.neon-proxy-link.yaml | 25 +++++++++++++++++++ ...s-east-2-beta.neon-proxy-scram-legacy.yaml | 25 +++++++++++++++++++ .../dev-us-east-2-beta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../helm-values/neon-stress.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/neon-stress.proxy.yaml | 25 +++++++++++++++++++ ...-southeast-1-epsilon.neon-proxy-scram.yaml | 25 +++++++++++++++++++ ...d-eu-central-1-gamma.neon-proxy-scram.yaml | 25 +++++++++++++++++++ ...prod-us-east-2-delta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../prod-us-west-2-eta.neon-proxy-scram.yaml | 25 +++++++++++++++++++ .../helm-values/production.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/production.proxy.yaml | 25 +++++++++++++++++++ .github/helm-values/staging.proxy-scram.yaml | 25 +++++++++++++++++++ .github/helm-values/staging.proxy.yaml | 25 +++++++++++++++++++ 14 files changed, 350 insertions(+) diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index f89eea5972..ae9c1f2e40 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index eeb025277b..093fac146a 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -38,3 +38,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index ed710bc196..a2f932e4fb 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index ba0109c1eb..1138536e94 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/neon-stress.proxy-scram.yaml b/.github/helm-values/neon-stress.proxy-scram.yaml index dea47304a0..ed580349fc 100644 --- a/.github/helm-values/neon-stress.proxy-scram.yaml +++ b/.github/helm-values/neon-stress.proxy-scram.yaml @@ -25,3 +25,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/neon-stress.proxy.yaml b/.github/helm-values/neon-stress.proxy.yaml index c3ecf6c743..94270ced09 100644 --- a/.github/helm-values/neon-stress.proxy.yaml +++ b/.github/helm-values/neon-stress.proxy.yaml @@ -34,3 +34,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index a37a37406c..4e4aff1f9e 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index 69d00a7e9c..94290a87e1 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 19d91fa4dc..1a4023708b 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index f148188c48..2942d6a2aa 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -30,3 +30,28 @@ exposedService: # enabled: true # selector: # release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/production.proxy-scram.yaml b/.github/helm-values/production.proxy-scram.yaml index 399bc6d21b..c7143cd61a 100644 --- a/.github/helm-values/production.proxy-scram.yaml +++ b/.github/helm-values/production.proxy-scram.yaml @@ -23,3 +23,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/production.proxy.yaml index 9db68c1044..dbaf3cd096 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/production.proxy.yaml @@ -32,3 +32,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/staging.proxy-scram.yaml b/.github/helm-values/staging.proxy-scram.yaml index f249df3612..66f9921c9a 100644 --- a/.github/helm-values/staging.proxy-scram.yaml +++ b/.github/helm-values/staging.proxy-scram.yaml @@ -30,3 +30,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . 
}}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/helm-values/staging.proxy.yaml b/.github/helm-values/staging.proxy.yaml index 62b4c4a595..a22082e625 100644 --- a/.github/helm-values/staging.proxy.yaml +++ b/.github/helm-values/staging.proxy.yaml @@ -30,3 +30,28 @@ metrics: enabled: true selector: release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" From f759b561f3f7489b536c8ed4638e34ea5c73c91a Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 29 Dec 2022 16:08:21 +0200 Subject: [PATCH 017/209] add pageserver to new region see https://github.com/neondatabase/aws/pull/116 --- .github/ansible/prod.us-west-2.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 7d6e49bf9c..9eb422a3ae 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -25,6 +25,8 @@ storage: ansible_host: i-0d9f6dfae0e1c780d pageserver-1.us-west-2.aws.neon.tech: ansible_host: i-0c834be1dddba8b3f + pageserver-2.us-west-2.aws.neon.tech: + ansible_host: i-051642d372c0a4f32 safekeepers: hosts: From 06d25f2186cc4b525dc138a8d193da92001f0e96 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 28 Dec 2022 17:48:49 +0200 Subject: [PATCH 018/209] switch to debug from info to produce less noise --- pageserver/src/storage_sync2.rs | 6 +++--- pageserver/src/tenant/timeline.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pageserver/src/storage_sync2.rs b/pageserver/src/storage_sync2.rs index 7cc0eac2bf..55dbeaff73 100644 --- a/pageserver/src/storage_sync2.rs +++ b/pageserver/src/storage_sync2.rs @@ -203,7 +203,7 @@ use std::sync::{Arc, Mutex}; use anyhow::ensure; use remote_storage::{DownloadError, GenericRemoteStorage}; use tokio::runtime::Runtime; -use tracing::{info, warn}; +use tracing::{debug, info, warn}; use tracing::{info_span, Instrument}; use utils::lsn::Lsn; @@ -747,7 +747,7 @@ impl RemoteTimelineClient { // We can launch this task. Remove it from the queue first. let next_op = upload_queue.queued_operations.pop_front().unwrap(); - info!("starting op: {}", next_op); + debug!("starting op: {}", next_op); // Update the counters match next_op { @@ -930,7 +930,7 @@ impl RemoteTimelineClient { task.op, retries ); } else { - info!("remote task {} completed successfully", task.op); + debug!("remote task {} completed successfully", task.op); } // The task has completed succesfully. Remove it from the in-progress list. 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index a746fd9bf8..cd045d1081 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2288,7 +2288,7 @@ impl Timeline { // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. if let Some(remote_client) = &self.remote_client { - info!("waiting for upload ops to complete"); + debug!("waiting for upload ops to complete"); remote_client .wait_completion() .await @@ -2499,7 +2499,7 @@ impl Timeline { // See storage_sync module level comment on consistency. // Do it here because we don't want to hold self.layers.write() while waiting. if let Some(remote_client) = &self.remote_client { - info!("waiting for upload ops to complete"); + debug!("waiting for upload ops to complete"); remote_client .wait_completion() .await From cd01bbc715e9ac594af29f6a657430e7a0703877 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Thu, 22 Dec 2022 19:21:53 +0400 Subject: [PATCH 019/209] Move zenith-1-sk-3 to zenith-1-sk-4 (#3164) --- .github/ansible/production.hosts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index d22c845966..3122a43801 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -34,5 +34,5 @@ storage: console_region_id: aws-us-west-2 zenith-1-sk-2: console_region_id: aws-us-west-2 - zenith-1-sk-3: + zenith-1-sk-4: console_region_id: aws-us-west-2 From d90c5a03af3763949442686e118581e5cdd4dd90 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Tue, 17 Jan 2023 22:07:38 +0200 Subject: [PATCH 020/209] Add more io::Error context when fail to operate on a path (#3254) I have a test failure that shows ``` Caused by: 0: Failed to reconstruct a page image: 1: Directory not empty (os error 39) ``` but does not really show where exactly that happens. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-3227/release/3823785365/index.html#categories/c0057473fc9ec8fb70876fd29a171ce8/7088dab272f2c7b7/?attachment=60fe6ed2add4d82d The PR aims to add more context in debugging that issue. --- pageserver/src/walredo.rs | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index a552c05d63..fd0524016f 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -626,24 +626,20 @@ impl PostgresRedoProcess { // Create empty data directory for wal-redo postgres, deleting old one first. 
if datadir.exists() { - info!( - "old temporary datadir {} exists, removing", - datadir.display() - ); - fs::remove_dir_all(&datadir)?; + info!("old temporary datadir {datadir:?} exists, removing"); + fs::remove_dir_all(&datadir).map_err(|e| { + Error::new( + e.kind(), + format!("Old temporary dir {datadir:?} removal failure: {e}"), + ) + })?; } - let pg_bin_dir_path = conf.pg_bin_dir(pg_version).map_err(|e| { - Error::new( - ErrorKind::Other, - format!("incorrect pg_bin_dir path: {}", e), - ) - })?; - let pg_lib_dir_path = conf.pg_lib_dir(pg_version).map_err(|e| { - Error::new( - ErrorKind::Other, - format!("incorrect pg_lib_dir path: {}", e), - ) - })?; + let pg_bin_dir_path = conf + .pg_bin_dir(pg_version) + .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_bin_dir path: {e}")))?; + let pg_lib_dir_path = conf + .pg_lib_dir(pg_version) + .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?; info!("running initdb in {}", datadir.display()); let initdb = Command::new(pg_bin_dir_path.join("initdb")) From bd535b3371f4e45f13a3f9abbacc3efbf931616f Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Jan 2023 02:29:05 +0200 Subject: [PATCH 021/209] If an error happens while checking for core dumps, don't panic. If we panic, we skip the 30s wait in 'main', and don't give the console a chance to observe the error. Which is not nice. Spotted by @ololobus at https://github.com/neondatabase/neon/pull/3352#discussion_r1072806981 --- compute_tools/src/compute.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c2c9ab2230..d652084e00 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -23,7 +23,7 @@ use std::sync::RwLock; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use log::{info, warn}; +use log::{error, info, warn}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; @@ -311,8 +311,9 @@ impl ComputeNode { .wait() .expect("failed to start waiting on Postgres process"); - self.check_for_core_dumps() - .expect("failed to check for core dumps"); + if let Err(err) = self.check_for_core_dumps() { + error!("error while checking for core dumps: {err:?}"); + } Ok(ecode) } From 4992160677509996064df4f3dacc79c64fe2c9c2 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 18 Jan 2023 14:58:55 +0200 Subject: [PATCH 022/209] Fix metric_collection_endpoint for prod. 
It was incorrectly set to staging url --- .github/ansible/production.hosts.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml index 22bace5ade..ecb847bd61 100644 --- a/.github/ansible/production.hosts.yaml +++ b/.github/ansible/production.hosts.yaml @@ -7,7 +7,7 @@ storage: broker_endpoint: http://storage-broker.prod.local:50051 pageserver_config_stub: pg_distrib_dir: /usr/local - metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events + metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events metric_collection_interval: 10min remote_storage: bucket_name: "{{ bucket_name }}" From c85374295fd38e081c1a683281016355471852bd Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 18 Jan 2023 14:49:59 +0100 Subject: [PATCH 023/209] Change SENTRY_ENVIRONMENT from "development" to "staging" --- .github/ansible/staging.eu-west-1.hosts.yaml | 2 +- .github/ansible/staging.us-east-2.hosts.yaml | 2 +- .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml | 2 +- .github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml | 2 +- .github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml | 2 +- .../helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml | 2 +- .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml | 2 +- .github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml index fce450ed39..f28dc8e07b 100644 --- a/.github/ansible/staging.eu-west-1.hosts.yaml +++ b/.github/ansible/staging.eu-west-1.hosts.yaml @@ -18,7 +18,7 @@ storage: ansible_aws_ssm_region: eu-west-1 ansible_aws_ssm_bucket_name: neon-dev-storage-eu-west-1 console_region_id: aws-eu-west-1 - sentry_environment: development + sentry_environment: staging children: pageservers: diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 1d1b8dbfa4..4891875369 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -18,7 +18,7 @@ storage: ansible_aws_ssm_region: us-east-2 ansible_aws_ssm_bucket_name: neon-staging-storage-us-east-2 console_region_id: aws-us-east-2 - sentry_environment: development + sentry_environment: staging children: pageservers: diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml index 47924456ba..c49b8d2009 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml @@ -8,7 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.eu-west-1.aws.neon.build" - sentryEnvironment: "development" + sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml index c6e682f571..ccf701f52d 100644 --- a/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-eu-west-1-zeta.neon-storage-broker.yaml @@ -49,4 +49,4 @@ extraManifests: - "{{ .Release.Namespace }}" settings: - 
sentryEnvironment: "development" + sentryEnvironment: "staging" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index eb8fd50c0f..cb062f705d 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -8,7 +8,7 @@ settings: authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" - sentryEnvironment: "development" + sentryEnvironment: "staging" metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml index 8a08738d5f..99b67d75c1 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml @@ -8,7 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.cloud.stage.neon.tech" - sentryEnvironment: "development" + sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml index b02d46917c..764bb25b64 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml @@ -8,7 +8,7 @@ settings: authBackend: "console" authEndpoint: "http://console-staging.local/management/api/v2" domain: "*.us-east-2.aws.neon.build" - sentryEnvironment: "development" + sentryEnvironment: "staging" wssPort: 8443 metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml index c7682d24c0..69363c5f13 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-storage-broker.yaml @@ -49,4 +49,4 @@ extraManifests: - "{{ .Release.Namespace }}" settings: - sentryEnvironment: "development" + sentryEnvironment: "staging" From cb356f325978241e20a82703b04b784cdf8bb605 Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Sat, 7 Jan 2023 00:23:35 +0200 Subject: [PATCH 024/209] Use actual temporary dir for pageserver unit tests --- .gitignore | 2 - control_plane/.gitignore | 1 - pageserver/src/config.rs | 5 - pageserver/src/tenant.rs | 105 ++++++++---------- pageserver/src/tenant/ephemeral_file.rs | 36 +++--- .../src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/virtual_file.rs | 8 +- pageserver/src/walingest.rs | 12 +- .../src/walreceiver/connection_manager.rs | 16 +-- test_runner/sql_regress/.gitignore | 1 - 10 files changed, 80 insertions(+), 108 deletions(-) delete mode 100644 control_plane/.gitignore diff --git a/.gitignore b/.gitignore index f1afdee599..2e241ee8cd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,5 @@ /pg_install /target -/tmp_check -/tmp_check_cli __pycache__/ test_output/ .vscode diff --git a/control_plane/.gitignore b/control_plane/.gitignore deleted file mode 100644 index c1e54a6bcb..0000000000 --- 
a/control_plane/.gitignore +++ /dev/null @@ -1 +0,0 @@ -tmp_check/ diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 51d1664e52..f3e5fb8c1a 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -693,11 +693,6 @@ impl PageServerConf { Ok(t_conf) } - #[cfg(test)] - pub fn test_repo_dir(test_name: &str) -> PathBuf { - PathBuf::from(format!("../tmp_check/test_{test_name}")) - } - pub fn dummy_conf(repo_dir: PathBuf) -> Self { let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1d0d6b66ab..0dd6735993 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2626,9 +2626,9 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; - use once_cell::sync::Lazy; - use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; + use std::sync::Arc; use std::{fs, path::PathBuf}; + use tempfile::TempDir; use utils::lsn::Lsn; use crate::{ @@ -2659,8 +2659,6 @@ pub mod harness { buf.freeze() } - static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); - impl From for TenantConfOpt { fn from(tenant_conf: TenantConf) -> Self { Self { @@ -2681,36 +2679,27 @@ pub mod harness { } } - pub struct TenantHarness<'a> { + /// The harness saves some boilerplate and provides a way to create functional tenant + /// without running pageserver binary. It uses temporary directory to store data in it. + /// Tempdir gets removed on harness drop. + pub struct TenantHarness { + // keep the struct to not to remove tmp dir during the test + _temp_repo_dir: TempDir, pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: TenantId, - - pub lock_guard: ( - Option>, - Option>, - ), } - impl<'a> TenantHarness<'a> { - pub fn create(test_name: &'static str) -> anyhow::Result { - Self::create_internal(test_name, false) - } - pub fn create_exclusive(test_name: &'static str) -> anyhow::Result { - Self::create_internal(test_name, true) - } - fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result { - let lock_guard = if exclusive { - (None, Some(LOCK.write().unwrap())) - } else { - (Some(LOCK.read().unwrap()), None) - }; + static LOG_HANDLE: OnceCell<()> = OnceCell::new(); - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - fs::create_dir_all(&repo_dir)?; + impl TenantHarness { + pub fn new() -> anyhow::Result { + let temp_repo_dir = tempfile::tempdir()?; + // `TempDir` uses a randomly generated subdirectory of a system tmp dir, + // so far it's enough to take care of concurrently running tests. + let repo_dir = temp_repo_dir.path(); - let conf = PageServerConf::dummy_conf(repo_dir); + let conf = PageServerConf::dummy_conf(repo_dir.to_path_buf()); // Make a static copy of the config. This can never be free'd, but that's // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); @@ -2728,10 +2717,10 @@ pub mod harness { fs::create_dir_all(conf.timelines_path(&tenant_id))?; Ok(Self { + _temp_repo_dir: temp_repo_dir, conf, tenant_conf, tenant_id, - lock_guard, }) } @@ -2825,7 +2814,8 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_basic")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; @@ -2858,9 +2848,8 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let tenant = TenantHarness::create("no_duplicate_timelines")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let _ = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2891,7 +2880,8 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_branch")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2988,10 +2978,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let tenant = - TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3026,9 +3014,8 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; tenant .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? @@ -3077,9 +3064,8 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3101,9 +3087,8 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; @@ -3134,8 +3119,7 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { - const TEST_NAME: &str = "timeline_load"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::new()?; { let tenant = harness.load().await; let tline = tenant @@ -3154,8 +3138,7 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { - const TEST_NAME: &str = "timeline_load_with_ancestor"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::new()?; // create two timelines { let tenant = harness.load().await; @@ -3193,8 +3176,7 @@ mod tests { #[tokio::test] async fn corrupt_metadata() -> anyhow::Result<()> { - const TEST_NAME: &str = "corrupt_metadata"; - let harness = TenantHarness::create(TEST_NAME)?; + let harness = TenantHarness::new()?; let tenant = harness.load().await; tenant @@ -3235,7 +3217,8 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_images")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3302,7 +3285,8 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_bulk_insert")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3346,7 +3330,8 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_random_updates")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3419,9 +3404,8 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_branches")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3505,9 +3489,8 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_ancestors")? - .load() - .await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index c433e65ad2..0debeaff1c 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -76,7 +76,7 @@ impl EphemeralFile { }) } - fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> { + fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> io::Result<()> { let mut off = 0; while off < PAGE_SZ { let n = self @@ -277,7 +277,7 @@ impl Drop for EphemeralFile { } } -pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> { +pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> io::Result<()> { if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) { match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) { Ok(_) => Ok(()), @@ -332,25 +332,17 @@ mod tests { use super::*; use crate::tenant::blob_io::{BlobCursor, BlobWriter}; use crate::tenant::block_io::BlockCursor; + use crate::tenant::harness::TenantHarness; use rand::{seq::SliceRandom, thread_rng, RngCore}; use std::fs; use std::str::FromStr; - fn harness( - test_name: &str, - ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { - let repo_dir = PageServerConf::test_repo_dir(test_name); - let _ = fs::remove_dir_all(&repo_dir); - let conf = PageServerConf::dummy_conf(repo_dir); - // Make a static copy of the config. This can never be free'd, but that's - // OK in a test. - let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - - let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + fn harness() -> Result<(TenantHarness, TimelineId), io::Error> { + let harness = TenantHarness::new().expect("Failed to create tenant harness"); let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; + fs::create_dir_all(harness.timeline_path(&timeline_id))?; - Ok((conf, tenant_id, timeline_id)) + Ok((harness, timeline_id)) } // Helper function to slurp contents of a file, starting at the current position, @@ -367,10 +359,10 @@ mod tests { } #[test] - fn test_ephemeral_files() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?; + fn test_ephemeral_files() -> io::Result<()> { + let (harness, timeline_id) = harness()?; - let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let file_a = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; file_a.write_all_at(b"foo", 0)?; assert_eq!("foo", read_string(&file_a, 0, 20)?); @@ -381,7 +373,7 @@ mod tests { // Open a lot of files, enough to cause some page evictions. 
let mut efiles = Vec::new(); for fileno in 0..100 { - let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let efile = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); efiles.push((fileno, efile)); @@ -398,10 +390,10 @@ mod tests { } #[test] - fn test_ephemeral_blobs() -> Result<(), io::Error> { - let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; + fn test_ephemeral_blobs() -> io::Result<()> { + let (harness, timeline_id) = harness()?; - let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; + let mut file = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 013591caee..58b7eea1eb 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1064,7 +1064,7 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { - let harness = TenantHarness::create("upload_scheduling")?; + let harness = TenantHarness::new()?; let timeline_path = harness.timeline_path(&TIMELINE_ID); std::fs::create_dir_all(&timeline_path)?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index fb216123c1..3ad049cc21 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -525,12 +525,13 @@ mod tests { }) } - fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> + fn test_files(test_name: &str, openfunc: OF) -> Result<(), Error> where FD: Read + Write + Seek + FileExt, OF: Fn(&Path, &OpenOptions) -> Result, { - let testdir = crate::config::PageServerConf::test_repo_dir(testname); + let temp_repo_dir = tempfile::tempdir()?; + let testdir = temp_repo_dir.path().join(test_name); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); @@ -632,7 +633,8 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; - let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); + let temp_repo_dir = tempfile::tempdir()?; + let testdir = temp_repo_dir.path().join("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; // Create a test file. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0de2e6654d..77fce95160 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1146,7 +1146,8 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let tenant = TenantHarness::create("test_relsize")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1323,7 +1324,8 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let tenant = TenantHarness::create("test_drop_extend")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1376,7 +1378,8 @@ mod tests { // and then extended it again within the same layer. 
#[tokio::test] async fn test_truncate_extend() -> Result<()> { - let tenant = TenantHarness::create("test_truncate_extend")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1497,7 +1500,8 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[tokio::test] async fn test_large_rel() -> Result<()> { - let tenant = TenantHarness::create("test_large_rel")?.load().await; + let harness = TenantHarness::new()?; + let tenant = harness.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index 8b60e59305..be58aa0e07 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -846,7 +846,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_no_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -879,7 +879,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("connection_no_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -942,7 +942,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("no_connection_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1001,7 +1001,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::create("candidate_with_many_connection_failures")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1041,7 +1041,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1105,7 +1105,7 @@ mod tests { #[tokio::test] async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1166,7 +1166,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; + let harness = TenantHarness::new()?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1232,7 +1232,7 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; - async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + async fn dummy_state(harness: 
&TenantHarness) -> WalreceiverState { WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, diff --git a/test_runner/sql_regress/.gitignore b/test_runner/sql_regress/.gitignore index 89129d7358..83186b5c86 100644 --- a/test_runner/sql_regress/.gitignore +++ b/test_runner/sql_regress/.gitignore @@ -2,7 +2,6 @@ /pg_regress # Generated subdirectories -/tmp_check/ /results/ /log/ From ffca97bc1e10b6c351067b14bdee6bbc68369c4b Mon Sep 17 00:00:00 2001 From: Kirill Bulatov Date: Thu, 12 Jan 2023 15:14:04 +0200 Subject: [PATCH 025/209] Enable logs in unit tests --- libs/utils/src/logging.rs | 2 ++ pageserver/src/tenant.rs | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 3b1a1f5aff..82c9267f4a 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -8,6 +8,7 @@ use strum_macros::{EnumString, EnumVariantNames}; pub enum LogFormat { Plain, Json, + Test, } impl LogFormat { @@ -39,6 +40,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { match log_format { LogFormat::Json => base_logger.json().init(), LogFormat::Plain => base_logger.init(), + LogFormat::Test => base_logger.with_test_writer().init(), } Ok(()) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0dd6735993..c53c9bc3e1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2626,9 +2626,11 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; + use once_cell::sync::OnceCell; use std::sync::Arc; use std::{fs, path::PathBuf}; use tempfile::TempDir; + use utils::logging; use utils::lsn::Lsn; use crate::{ @@ -2694,6 +2696,10 @@ pub mod harness { impl TenantHarness { pub fn new() -> anyhow::Result { + LOG_HANDLE.get_or_init(|| { + logging::init(logging::LogFormat::Test).expect("Failed to init test logging") + }); + let temp_repo_dir = tempfile::tempdir()?; // `TempDir` uses a randomly generated subdirectory of a system tmp dir, // so far it's enough to take care of concurrently running tests. From 7b22b5c43321bf729caea4766c7f98b589c405ab Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 18 Jan 2023 11:30:02 +0200 Subject: [PATCH 026/209] Switch to 'tracing' for logging, restructure code to make use of spans. Refactors Compute::prepare_and_run. It's split into subroutines differently, to make it easier to attach tracing spans to the different stages. The high-level logic for waiting for Postgres to exit is moved to the caller. Replace 'env_logger' with 'tracing', and add `#instrument` directives to different stages fo the startup process. This is a fairly mechanical change, except for the changes in 'spec.rs'. 'spec.rs' contained some complicated formatting, where parts of log messages were printed directly to stdout with `print`s. That was a bit messed up because the log normally goes to stderr, but those lines were printed to stdout. In our docker images, stderr and stdout both go to the same place so you wouldn't notice, but I don't think it was intentional. This changes the log format to the default 'tracing_subscriber::format' format. It's different from the Postgres log format, however, and because both compute_tools and Postgres print to the same log, it's now a mix of two different formats. I'm not sure how the Grafana log parsing pipeline can handle that. 
If it's a problem, we can build custom formatter to change the compute_tools log format to be the same as Postgres's, like it was before this commit, or we can change the Postgres log format to match tracing_formatter's, or we can start printing compute_tool's log output to a different destination than Postgres --- Cargo.lock | 6 +- compute_tools/Cargo.toml | 4 +- compute_tools/src/bin/compute_ctl.rs | 55 +++++--- compute_tools/src/checker.rs | 4 +- compute_tools/src/compute.rs | 77 ++++++----- compute_tools/src/http/api.rs | 2 +- compute_tools/src/informant.rs | 2 +- compute_tools/src/logger.rs | 44 ++---- compute_tools/src/monitor.rs | 2 +- compute_tools/src/pg_helpers.rs | 12 +- compute_tools/src/spec.rs | 171 ++++++++++++++---------- test_runner/regress/test_compute_ctl.py | 2 +- workspace_hack/Cargo.toml | 5 +- 13 files changed, 210 insertions(+), 176 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 59adf696a7..d8aba9ba68 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -833,10 +833,8 @@ dependencies = [ "anyhow", "chrono", "clap 4.0.32", - "env_logger", "futures", "hyper", - "log", "notify", "postgres", "regex", @@ -845,6 +843,8 @@ dependencies = [ "tar", "tokio", "tokio-postgres", + "tracing", + "tracing-subscriber", "url", "workspace_hack", ] @@ -1954,7 +1954,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" dependencies = [ "cfg-if", - "serde", ] [[package]] @@ -4565,6 +4564,7 @@ dependencies = [ "tower", "tracing", "tracing-core", + "tracing-subscriber", "url", ] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 1e0aee81d7..4536604bdf 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -8,10 +8,8 @@ license.workspace = true anyhow.workspace = true chrono.workspace = true clap.workspace = true -env_logger.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } -log = { workspace = true, features = ["std", "serde"] } notify.workspace = true postgres.workspace = true regex.workspace = true @@ -20,6 +18,8 @@ serde_json.workspace = true tar.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true +tracing.workspace = true +tracing-subscriber.workspace = true url.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 307300cfd8..e5ab8eb153 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -40,7 +40,7 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; -use log::{error, info}; +use tracing::{error, info}; use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus}; use compute_tools::http::api::launch_http_server; @@ -53,7 +53,6 @@ use compute_tools::spec::*; use url::Url; fn main() -> Result<()> { - // TODO: re-use `utils::logging` later init_logger(DEFAULT_LOG_LEVEL)?; let matches = cli().get_matches(); @@ -122,29 +121,45 @@ fn main() -> Result<()> { // Also spawn the thread responsible for handling the VM informant -- if it's present let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant"); - // Run compute (Postgres) and hang waiting on it. 
- match compute.prepare_and_run() { - Ok(ec) => { - let code = ec.code().unwrap_or(1); - info!("Postgres exited with code {}, shutting down", code); - exit(code) - } - Err(error) => { - error!("could not start the compute node: {:?}", error); - + // Start Postgres + let mut delay_exit = false; + let mut exit_code = None; + let pg = match compute.start_compute() { + Ok(pg) => Some(pg), + Err(err) => { + error!("could not start the compute node: {:?}", err); let mut state = compute.state.write().unwrap(); - state.error = Some(format!("{:?}", error)); + state.error = Some(format!("{:?}", err)); state.status = ComputeStatus::Failed; drop(state); - - // Keep serving HTTP requests, so the cloud control plane was able to - // get the actual error. - info!("giving control plane 30s to collect the error before shutdown"); - thread::sleep(Duration::from_secs(30)); - info!("shutting down"); - Err(error) + delay_exit = true; + None } + }; + + // Wait for the child Postgres process forever. In this state Ctrl+C will + // propagate to Postgres and it will be shut down as well. + if let Some(mut pg) = pg { + let ecode = pg + .wait() + .expect("failed to start waiting on Postgres process"); + info!("Postgres exited with code {}, shutting down", ecode); + exit_code = ecode.code() } + + if let Err(err) = compute.check_for_core_dumps() { + error!("error while checking for core dumps: {err:?}"); + } + + // If launch failed, keep serving HTTP requests for a while, so the cloud + // control plane can get the actual error. + if delay_exit { + info!("giving control plane 30s to collect the error before shutdown"); + thread::sleep(Duration::from_secs(30)); + info!("shutting down"); + } + + exit(exit_code.unwrap_or(1)) } fn cli() -> clap::Command { diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index ee1605c814..b8413de516 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -1,10 +1,11 @@ use anyhow::{anyhow, Result}; -use log::error; use postgres::Client; use tokio_postgres::NoTls; +use tracing::{error, instrument}; use crate::compute::ComputeNode; +#[instrument(skip_all)] pub fn create_writability_check_data(client: &mut Client) -> Result<()> { let query = " CREATE TABLE IF NOT EXISTS health_check ( @@ -21,6 +22,7 @@ pub fn create_writability_check_data(client: &mut Client) -> Result<()> { Ok(()) } +#[instrument(skip_all)] pub async fn check_writability(compute: &ComputeNode) -> Result<()> { let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; if client.is_closed() { diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d652084e00..e229bb1cc2 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -17,15 +17,15 @@ use std::fs; use std::os::unix::fs::PermissionsExt; use std::path::Path; -use std::process::{Command, ExitStatus, Stdio}; +use std::process::{Command, Stdio}; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::RwLock; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; -use log::{error, info, warn}; use postgres::{Client, NoTls}; use serde::{Serialize, Serializer}; +use tracing::{info, instrument, warn}; use crate::checker::create_writability_check_data; use crate::config; @@ -121,6 +121,7 @@ impl ComputeNode { // Get basebackup from the libpq connection to pageserver using `connstr` and // unarchive it to `pgdata` directory overriding all its previous content. 
+ #[instrument(skip(self))] fn get_basebackup(&self, lsn: &str) -> Result<()> { let start_time = Utc::now(); @@ -154,6 +155,7 @@ impl ComputeNode { // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. + #[instrument(skip(self))] fn sync_safekeepers(&self) -> Result { let start_time = Utc::now(); @@ -196,6 +198,7 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. + #[instrument(skip(self))] pub fn prepare_pgdata(&self) -> Result<()> { let spec = &self.spec; let pgdata_path = Path::new(&self.pgdata); @@ -229,9 +232,8 @@ impl ComputeNode { /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. - pub fn run(&self) -> Result { - let start_time = Utc::now(); - + #[instrument(skip(self))] + pub fn start_postgres(&self) -> Result { let pgdata_path = Path::new(&self.pgdata); // Run postgres as a child process. @@ -242,6 +244,11 @@ impl ComputeNode { wait_for_postgres(&mut pg, pgdata_path)?; + Ok(pg) + } + + #[instrument(skip(self))] + pub fn apply_config(&self) -> Result<()> { // If connection fails, // it may be the old node with `zenith_admin` superuser. // @@ -279,8 +286,34 @@ impl ComputeNode { // 'Close' connection drop(client); - let startup_end_time = Utc::now(); + info!( + "finished configuration of compute for project {}", + self.spec.cluster.cluster_id + ); + + Ok(()) + } + + #[instrument(skip(self))] + pub fn start_compute(&self) -> Result { + info!( + "starting compute for project {}, operation {}, tenant {}, timeline {}", + self.spec.cluster.cluster_id, + self.spec.operation_uuid.as_ref().unwrap(), + self.tenant, + self.timeline, + ); + + self.prepare_pgdata()?; + + let start_time = Utc::now(); + + let pg = self.start_postgres()?; + + self.apply_config()?; + + let startup_end_time = Utc::now(); self.metrics.config_ms.store( startup_end_time .signed_duration_since(start_time) @@ -300,35 +333,7 @@ impl ComputeNode { self.set_status(ComputeStatus::Running); - info!( - "finished configuration of compute for project {}", - self.spec.cluster.cluster_id - ); - - // Wait for child Postgres process basically forever. In this state Ctrl+C - // will propagate to Postgres and it will be shut down as well. - let ecode = pg - .wait() - .expect("failed to start waiting on Postgres process"); - - if let Err(err) = self.check_for_core_dumps() { - error!("error while checking for core dumps: {err:?}"); - } - - Ok(ecode) - } - - pub fn prepare_and_run(&self) -> Result { - info!( - "starting compute for project {}, operation {}, tenant {}, timeline {}", - self.spec.cluster.cluster_id, - self.spec.operation_uuid.as_ref().unwrap(), - self.tenant, - self.timeline, - ); - - self.prepare_pgdata()?; - self.run() + Ok(pg) } // Look for core dumps and collect backtraces. @@ -341,7 +346,7 @@ impl ComputeNode { // // Use that as a default location and pattern, except macos where core dumps are written // to /cores/ directory by default. 
- fn check_for_core_dumps(&self) -> Result<()> { + pub fn check_for_core_dumps(&self) -> Result<()> { let core_dump_dir = match std::env::consts::OS { "macos" => Path::new("/cores/"), _ => Path::new(&self.pgdata), diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 44f83e5003..f2a49f332c 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -6,8 +6,8 @@ use std::thread; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; -use log::{error, info}; use serde_json; +use tracing::{error, info}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/informant.rs b/compute_tools/src/informant.rs index 09bd5e3138..8a6e3ab43a 100644 --- a/compute_tools/src/informant.rs +++ b/compute_tools/src/informant.rs @@ -1,8 +1,8 @@ -use log::{info, warn}; use std::path::Path; use std::process; use std::thread; use std::time::Duration; +use tracing::{info, warn}; use anyhow::{Context, Result}; diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index dde0a950f8..57e5496e86 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,42 +1,20 @@ -use std::io::Write; - use anyhow::Result; -use chrono::Utc; -use env_logger::{Builder, Env}; - -macro_rules! info_println { - ($($tts:tt)*) => { - if log_enabled!(Level::Info) { - println!($($tts)*); - } - } -} - -macro_rules! info_print { - ($($tts:tt)*) => { - if log_enabled!(Level::Info) { - print!($($tts)*); - } - } -} +use tracing_subscriber::layer::SubscriberExt; +use tracing_subscriber::prelude::*; /// Initialize `env_logger` using either `default_level` or /// `RUST_LOG` environment variable as default log level. pub fn init_logger(default_level: &str) -> Result<()> { - let env = Env::default().filter_or("RUST_LOG", default_level); + let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level)); - Builder::from_env(env) - .format(|buf, record| { - let thread_handle = std::thread::current(); - writeln!( - buf, - "{} [{}] {}: {}", - Utc::now().format("%Y-%m-%d %H:%M:%S%.3f %Z"), - thread_handle.name().unwrap_or("main"), - record.level(), - record.args() - ) - }) + let fmt_layer = tracing_subscriber::fmt::layer() + .with_target(false) + .with_writer(std::io::stderr); + + tracing_subscriber::registry() + .with(env_filter) + .with(fmt_layer) .init(); Ok(()) diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index c871422e78..7c9878ffcf 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -3,8 +3,8 @@ use std::{thread, time}; use anyhow::Result; use chrono::{DateTime, Utc}; -use log::{debug, info}; use postgres::{Client, NoTls}; +use tracing::{debug, info}; use crate::compute::ComputeNode; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index ff422f1cf5..921289d7c2 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -11,6 +11,7 @@ use anyhow::{bail, Result}; use notify::{RecursiveMode, Watcher}; use postgres::{Client, Transaction}; use serde::Deserialize; +use tracing::{debug, instrument}; const POSTGRES_WAIT_TIMEOUT: Duration = Duration::from_millis(60 * 1000); // milliseconds @@ -229,6 +230,7 @@ pub fn get_existing_dbs(client: &mut Client) -> Result> { /// Wait for Postgres to become ready to accept connections. 
It's ready to /// accept connections when the state-field in `pgdata/postmaster.pid` says /// 'ready'. +#[instrument(skip(pg))] pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { let pid_path = pgdata.join("postmaster.pid"); @@ -287,18 +289,18 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { } let res = rx.recv_timeout(Duration::from_millis(100)); - log::debug!("woken up by notify: {res:?}"); + debug!("woken up by notify: {res:?}"); // If there are multiple events in the channel already, we only need to be // check once. Swallow the extra events before we go ahead to check the // pid file. while let Ok(res) = rx.try_recv() { - log::debug!("swallowing extra event: {res:?}"); + debug!("swallowing extra event: {res:?}"); } // Check that we can open pid file first. if let Ok(file) = File::open(&pid_path) { if !postmaster_pid_seen { - log::debug!("postmaster.pid appeared"); + debug!("postmaster.pid appeared"); watcher .unwatch(pgdata) .expect("Failed to remove pgdata dir watch"); @@ -314,7 +316,7 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { // Pid file could be there and we could read it, but it could be empty, for example. if let Some(Ok(line)) = last_line { let status = line.trim(); - log::debug!("last line of postmaster.pid: {status:?}"); + debug!("last line of postmaster.pid: {status:?}"); // Now Postgres is ready to accept connections if status == "ready" { @@ -330,7 +332,7 @@ pub fn wait_for_postgres(pg: &mut Child, pgdata: &Path) -> Result<()> { } } - log::info!("PostgreSQL is now running, continuing to configure it"); + tracing::info!("PostgreSQL is now running, continuing to configure it"); Ok(()) } diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 81e01fe555..40c8366bf4 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,12 +1,11 @@ use std::path::Path; use std::str::FromStr; -use std::time::Instant; use anyhow::Result; -use log::{info, log_enabled, warn, Level}; use postgres::config::Config; use postgres::{Client, NoTls}; use serde::Deserialize; +use tracing::{info, info_span, instrument, span_enabled, warn, Level}; use crate::compute::ComputeNode; use crate::config; @@ -80,23 +79,25 @@ pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> { /// Given a cluster spec json and open transaction it handles roles creation, /// deletion and update. 
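The `span_enabled!` check used in `handle_roles` below replaces the old `info_println!`/`info_print!` macros. A hedged sketch of the same gating idea, assuming the `tracing` and `tracing-subscriber` crates (the role names are made up):

    use tracing::{info, span_enabled, Level};

    fn list_roles(names: &[&str]) {
        // Skip building the (potentially long) listing entirely when INFO-level
        // spans are disabled by the active subscriber/filter.
        if span_enabled!(Level::INFO) {
            info!("postgres roles:");
            for name in names {
                info!(" - {}", name);
            }
        }
    }

    fn main() {
        tracing_subscriber::fmt().init();
        list_roles(&["admin", "app_user"]);
    }
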
+#[instrument(skip_all)] pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let mut xact = client.transaction()?; let existing_roles: Vec = get_existing_roles(&mut xact)?; // Print a list of existing Postgres roles (only in debug mode) - info!("postgres roles:"); - for r in &existing_roles { - info_println!( - "{} - {}:{}", - " ".repeat(27 + 5), - r.name, - if r.encrypted_password.is_some() { - "[FILTERED]" - } else { - "(null)" - } - ); + if span_enabled!(Level::INFO) { + info!("postgres roles:"); + for r in &existing_roles { + info!( + " - {}:{}", + r.name, + if r.encrypted_password.is_some() { + "[FILTERED]" + } else { + "(null)" + } + ); + } } // Process delta operations first @@ -137,58 +138,68 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { info!("cluster spec roles:"); for role in &spec.cluster.roles { let name = &role.name; - - info_print!( - "{} - {}:{}", - " ".repeat(27 + 5), - name, - if role.encrypted_password.is_some() { - "[FILTERED]" - } else { - "(null)" - } - ); - // XXX: with a limited number of roles it is fine, but consider making it a HashMap let pg_role = existing_roles.iter().find(|r| r.name == *name); - if let Some(r) = pg_role { - let mut update_role = false; - + enum RoleAction { + None, + Update, + Create, + } + let action = if let Some(r) = pg_role { if (r.encrypted_password.is_none() && role.encrypted_password.is_some()) || (r.encrypted_password.is_some() && role.encrypted_password.is_none()) { - update_role = true; + RoleAction::Update } else if let Some(pg_pwd) = &r.encrypted_password { // Check whether password changed or not (trim 'md5:' prefix first) - update_role = pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap(); + if pg_pwd[3..] != *role.encrypted_password.as_ref().unwrap() { + RoleAction::Update + } else { + RoleAction::None + } + } else { + RoleAction::None } + } else { + RoleAction::Create + }; - if update_role { + match action { + RoleAction::None => {} + RoleAction::Update => { let mut query: String = format!("ALTER ROLE {} ", name.pg_quote()); - info_print!(" -> update"); - query.push_str(&role.to_pg_options()); xact.execute(query.as_str(), &[])?; } - } else { - info!("role name: '{}'", &name); - let mut query: String = format!("CREATE ROLE {} ", name.pg_quote()); - info!("role create query: '{}'", &query); - info_print!(" -> create"); + RoleAction::Create => { + let mut query: String = format!("CREATE ROLE {} ", name.pg_quote()); + info!("role create query: '{}'", &query); + query.push_str(&role.to_pg_options()); + xact.execute(query.as_str(), &[])?; - query.push_str(&role.to_pg_options()); - xact.execute(query.as_str(), &[])?; - - let grant_query = format!( - "GRANT pg_read_all_data, pg_write_all_data TO {}", - name.pg_quote() - ); - xact.execute(grant_query.as_str(), &[])?; - info!("role grant query: '{}'", &grant_query); + let grant_query = format!( + "GRANT pg_read_all_data, pg_write_all_data TO {}", + name.pg_quote() + ); + xact.execute(grant_query.as_str(), &[])?; + info!("role grant query: '{}'", &grant_query); + } } - info_print!("\n"); + if span_enabled!(Level::INFO) { + let pwd = if role.encrypted_password.is_some() { + "[FILTERED]" + } else { + "(null)" + }; + let action_str = match action { + RoleAction::None => "", + RoleAction::Create => " -> create", + RoleAction::Update => " -> update", + }; + info!(" - {}:{}{}", name, pwd, action_str); + } } xact.commit()?; @@ -197,6 +208,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { } /// 
Reassign all dependent objects and delete requested roles. +#[instrument(skip_all)] pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result<()> { if let Some(ops) = &node.spec.delta_operations { // First, reassign all dependent objects to db owners. @@ -261,13 +273,16 @@ fn reassign_owned_objects(node: &ComputeNode, role_name: &PgIdent) -> Result<()> /// like `CREATE DATABASE` and `DROP DATABASE` do not support it. Statement-level /// atomicity should be enough here due to the order of operations and various checks, /// which together provide us idempotency. +#[instrument(skip_all)] pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { let existing_dbs: Vec = get_existing_dbs(client)?; // Print a list of existing Postgres databases (only in debug mode) - info!("postgres databases:"); - for r in &existing_dbs { - info_println!("{} - {}:{}", " ".repeat(27 + 5), r.name, r.owner); + if span_enabled!(Level::INFO) { + info!("postgres databases:"); + for r in &existing_dbs { + info!(" {}:{}", r.name, r.owner); + } } // Process delta operations first @@ -310,13 +325,15 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { for db in &spec.cluster.databases { let name = &db.name; - info_print!("{} - {}:{}", " ".repeat(27 + 5), db.name, db.owner); - // XXX: with a limited number of databases it is fine, but consider making it a HashMap let pg_db = existing_dbs.iter().find(|r| r.name == *name); - let start_time = Instant::now(); - if let Some(r) = pg_db { + enum DatabaseAction { + None, + Update, + Create, + } + let action = if let Some(r) = pg_db { // XXX: db owner name is returned as quoted string from Postgres, // when quoting is needed. let new_owner = if r.owner.starts_with('"') { @@ -326,29 +343,42 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { }; if new_owner != r.owner { + // Update the owner + DatabaseAction::Update + } else { + DatabaseAction::None + } + } else { + DatabaseAction::Create + }; + + match action { + DatabaseAction::None => {} + DatabaseAction::Update => { let query: String = format!( "ALTER DATABASE {} OWNER TO {}", name.pg_quote(), db.owner.pg_quote() ); - info_print!(" -> update"); - + let _ = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; - let elapsed = start_time.elapsed().as_millis(); - info_print!(" ({} ms)", elapsed); } - } else { - let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); - info_print!(" -> create"); + DatabaseAction::Create => { + let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); + query.push_str(&db.to_pg_options()); + let _ = info_span!("executing", query).entered(); + client.execute(query.as_str(), &[])?; + } + }; - query.push_str(&db.to_pg_options()); - client.execute(query.as_str(), &[])?; - - let elapsed = start_time.elapsed().as_millis(); - info_print!(" ({} ms)", elapsed); + if span_enabled!(Level::INFO) { + let action_str = match action { + DatabaseAction::None => "", + DatabaseAction::Create => " -> create", + DatabaseAction::Update => " -> update", + }; + info!(" - {}:{}{}", db.name, db.owner, action_str); } - - info_print!("\n"); } Ok(()) @@ -356,6 +386,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { /// Grant CREATE ON DATABASE to the database owner and do some other alters and grants /// to allow users creating trusted extensions and re-creating `public` schema, for example. 
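Both `handle_roles` and `handle_databases` above now follow the same shape: classify the change first, run the SQL, then log the name and outcome in one place. A condensed, self-contained sketch of that shape (the names and the `println!` stand-in are illustrative, not part of the patch):

    enum Action {
        None,
        Update,
        Create,
    }

    fn classify(existing: Option<&str>, wanted: &str) -> Action {
        // Decide what to do before touching anything.
        match existing {
            Some(current) if current == wanted => Action::None,
            Some(_) => Action::Update,
            None => Action::Create,
        }
    }

    fn main() {
        for (existing, wanted) in [(None, "alice"), (Some("bob"), "carol")] {
            let action = classify(existing, wanted);
            // In the real code this is where the ALTER/CREATE statement would run.
            let action_str = match action {
                Action::None => "",
                Action::Update => " -> update",
                Action::Create => " -> create",
            };
            // Single logging point, mirroring the `info!(" - {}:{}{}", ...)` calls.
            println!(" - {}{}", wanted, action_str);
        }
    }
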
+#[instrument(skip_all)] pub fn handle_grants(node: &ComputeNode, client: &mut Client) -> Result<()> { let spec = &node.spec; diff --git a/test_runner/regress/test_compute_ctl.py b/test_runner/regress/test_compute_ctl.py index f973bd8e60..05ac3841dc 100644 --- a/test_runner/regress/test_compute_ctl.py +++ b/test_runner/regress/test_compute_ctl.py @@ -194,7 +194,7 @@ def test_sync_safekeepers_logs(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): ) except TimeoutExpired as exc: ctl_logs = (exc.stderr or b"").decode("utf-8") - log.info("compute_ctl output:\n{ctl_logs}") + log.info(f"compute_ctl stderr:\n{ctl_logs}") with ExternalProcessManager(Path(pgdata) / "postmaster.pid"): start = "starting safekeepers syncing" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8addfcf72e..f4b71ae9b7 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -26,7 +26,7 @@ futures-util = { version = "0.3", features = ["channel", "io", "sink"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["serde", "std"] } +log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } @@ -45,6 +45,7 @@ tokio-util = { version = "0.7", features = ["codec", "io"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } +tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } url = { version = "2", features = ["serde"] } [build-dependencies] @@ -54,7 +55,7 @@ either = { version = "1" } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] } -log = { version = "0.4", default-features = false, features = ["serde", "std"] } +log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } prost = { version = "0.11" } From 300da5b872e105e404eb0842c9302f5742fb164b Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Thu, 19 Jan 2023 10:29:15 -0500 Subject: [PATCH 027/209] Improve layer map docstrings (#3382) --- pageserver/src/tenant/layer_map.rs | 52 ++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 44bed5959f..01c5359e88 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -250,15 +250,32 @@ where L: ?Sized + Layer, { /// - /// Find the latest layer that covers the given 'key', with lsn < - /// 'end_lsn'. + /// Find the latest layer (by lsn.end) that covers the given + /// 'key', with lsn.start < 'end_lsn'. /// - /// Returns the layer, if any, and an 'lsn_floor' value that - /// indicates which portion of the layer the caller should - /// check. 'lsn_floor' is normally the start-LSN of the layer, but - /// can be greater if there is an overlapping layer that might - /// contain the version, even if it's missing from the returned - /// layer. + /// The caller of this function is the page reconstruction + /// algorithm looking for the next relevant delta layer, or + /// the terminal image layer. 
The caller will pass the lsn_floor + /// value as end_lsn in the next call to search. + /// + /// If there's an image layer exactly below the given end_lsn, + /// search should return that layer regardless if there are + /// overlapping deltas. + /// + /// If the latest layer is a delta and there is an overlapping + /// image with it below, the lsn_floor returned should be right + /// above that image so we don't skip it in the search. Otherwise + /// the lsn_floor returned should be the bottom of the delta layer + /// because we should make as much progress down the lsn axis + /// as possible. It's fine if this way we skip some overlapping + /// deltas, because the delta we returned would contain the same + /// wal content. + /// + /// TODO: This API is convoluted and inefficient. If the caller + /// makes N search calls, we'll end up finding the same latest + /// image layer N times. We should either cache the latest image + /// layer result, or simplify the api to `get_latest_image` and + /// `get_latest_delta`, and only call `get_latest_image` once. /// /// NOTE: This only searches the 'historic' layers, *not* the /// 'open' and 'frozen' layers! @@ -401,7 +418,9 @@ where NUM_ONDISK_LAYERS.dec(); } - /// Is there a newer image layer for given key- and LSN-range? + /// Is there a newer image layer for given key- and LSN-range? Or a set + /// of image layers within the specified lsn range that cover the entire + /// specified key range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. @@ -488,8 +507,8 @@ where /// /// Divide the whole given range of keys into sub-ranges based on the latest - /// image layer that covers each range. (This is used when creating new - /// image layers) + /// image layer that covers each range at the specified lsn (inclusive). + /// This is used when creating new image layers. /// // FIXME: clippy complains that the result type is very complex. She's probably // right... @@ -541,8 +560,15 @@ where Ok(ranges) } - /// Count how many L1 delta layers there are that overlap with the - /// given key and LSN range. + /// Count the height of the tallest stack of deltas in this 2d region. + /// + /// This number is used to compute the largest number of deltas that + /// we'll need to visit for any page reconstruction in this region. + /// We use this heuristic to decide whether to create an image layer. + /// + /// TODO currently we just return the total number of deltas in the + /// region, no matter if they're stacked on top of each other + /// or next to each other. pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { let mut result = 0; if lsn_range.start >= lsn_range.end { From 262265daad0d1a000d1f425d71ddfb723c3a05e8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 19 Jan 2023 18:49:36 +0100 Subject: [PATCH 028/209] Revert "Use actual temporary dir for pageserver unit tests" This reverts commit 826e89b9ce43ce2c4d046b2c5d6376c3de8dbbac. The problem with that commit was that it deletes the TempDir while there are still EphemeralFile instances open. At first I thought this could be fixed by simply adding Handle::current().block_on(task_mgr::shutdown(None, Some(tenant_id), None)) to TenantHarness::drop, but it turned out to be insufficient. So, reverting the commit until we find a proper solution. 
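To illustrate the ordering hazard described above, here is a minimal sketch (assuming the `tempfile` crate; this is not the actual pageserver code): the directory tree vanishes as soon as the `TempDir` is dropped, even though handles into it are still open.

    use std::fs::File;
    use std::io::Write;

    fn main() -> std::io::Result<()> {
        let dir = tempfile::tempdir()?;
        let path = dir.path().join("ephemeral_file");
        let mut file = File::create(&path)?;

        drop(dir); // the whole temporary directory tree is removed here

        // The already-open handle keeps working on Linux, but anything that later
        // re-opens the file by path will fail, because the path no longer exists.
        writeln!(file, "still writable through the open fd")?;
        assert!(!path.exists());
        Ok(())
    }
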
refs https://github.com/neondatabase/neon/issues/3385 --- .gitignore | 2 + control_plane/.gitignore | 1 + pageserver/src/config.rs | 5 + pageserver/src/tenant.rs | 105 +++++++++++------- pageserver/src/tenant/ephemeral_file.rs | 38 ++++--- .../src/tenant/remote_timeline_client.rs | 2 +- pageserver/src/virtual_file.rs | 8 +- pageserver/src/walingest.rs | 12 +- .../src/walreceiver/connection_manager.rs | 16 +-- test_runner/sql_regress/.gitignore | 1 + 10 files changed, 110 insertions(+), 80 deletions(-) create mode 100644 control_plane/.gitignore diff --git a/.gitignore b/.gitignore index 2e241ee8cd..f1afdee599 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ /pg_install /target +/tmp_check +/tmp_check_cli __pycache__/ test_output/ .vscode diff --git a/control_plane/.gitignore b/control_plane/.gitignore new file mode 100644 index 0000000000..c1e54a6bcb --- /dev/null +++ b/control_plane/.gitignore @@ -0,0 +1 @@ +tmp_check/ diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f3e5fb8c1a..51d1664e52 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -693,6 +693,11 @@ impl PageServerConf { Ok(t_conf) } + #[cfg(test)] + pub fn test_repo_dir(test_name: &str) -> PathBuf { + PathBuf::from(format!("../tmp_check/test_{test_name}")) + } + pub fn dummy_conf(repo_dir: PathBuf) -> Self { let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install"); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c53c9bc3e1..c18c645e5b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2626,10 +2626,10 @@ where #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; + use once_cell::sync::Lazy; use once_cell::sync::OnceCell; - use std::sync::Arc; + use std::sync::{Arc, RwLock, RwLockReadGuard, RwLockWriteGuard}; use std::{fs, path::PathBuf}; - use tempfile::TempDir; use utils::logging; use utils::lsn::Lsn; @@ -2661,6 +2661,8 @@ pub mod harness { buf.freeze() } + static LOCK: Lazy> = Lazy::new(|| RwLock::new(())); + impl From for TenantConfOpt { fn from(tenant_conf: TenantConf) -> Self { Self { @@ -2681,31 +2683,42 @@ pub mod harness { } } - /// The harness saves some boilerplate and provides a way to create functional tenant - /// without running pageserver binary. It uses temporary directory to store data in it. - /// Tempdir gets removed on harness drop. - pub struct TenantHarness { - // keep the struct to not to remove tmp dir during the test - _temp_repo_dir: TempDir, + pub struct TenantHarness<'a> { pub conf: &'static PageServerConf, pub tenant_conf: TenantConf, pub tenant_id: TenantId, + + pub lock_guard: ( + Option>, + Option>, + ), } static LOG_HANDLE: OnceCell<()> = OnceCell::new(); - impl TenantHarness { - pub fn new() -> anyhow::Result { + impl<'a> TenantHarness<'a> { + pub fn create(test_name: &'static str) -> anyhow::Result { + Self::create_internal(test_name, false) + } + pub fn create_exclusive(test_name: &'static str) -> anyhow::Result { + Self::create_internal(test_name, true) + } + fn create_internal(test_name: &'static str, exclusive: bool) -> anyhow::Result { + let lock_guard = if exclusive { + (None, Some(LOCK.write().unwrap())) + } else { + (Some(LOCK.read().unwrap()), None) + }; + LOG_HANDLE.get_or_init(|| { logging::init(logging::LogFormat::Test).expect("Failed to init test logging") }); - let temp_repo_dir = tempfile::tempdir()?; - // `TempDir` uses a randomly generated subdirectory of a system tmp dir, - // so far it's enough to take care of concurrently running tests. 
- let repo_dir = temp_repo_dir.path(); + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + fs::create_dir_all(&repo_dir)?; - let conf = PageServerConf::dummy_conf(repo_dir.to_path_buf()); + let conf = PageServerConf::dummy_conf(repo_dir); // Make a static copy of the config. This can never be free'd, but that's // OK in a test. let conf: &'static PageServerConf = Box::leak(Box::new(conf)); @@ -2723,10 +2736,10 @@ pub mod harness { fs::create_dir_all(conf.timelines_path(&tenant_id))?; Ok(Self { - _temp_repo_dir: temp_repo_dir, conf, tenant_conf, tenant_id, + lock_guard, }) } @@ -2820,8 +2833,7 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_basic")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2854,8 +2866,9 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("no_duplicate_timelines")? + .load() + .await; let _ = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2886,8 +2899,7 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_branch")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -2984,8 +2996,10 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = + TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? + .load() + .await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3020,8 +3034,9 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + .load() + .await; tenant .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? @@ -3070,8 +3085,9 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + .load() + .await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3093,8 +3109,9 @@ mod tests { } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? + .load() + .await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; @@ -3125,7 +3142,8 @@ mod tests { #[tokio::test] async fn timeline_load() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + const TEST_NAME: &str = "timeline_load"; + let harness = TenantHarness::create(TEST_NAME)?; { let tenant = harness.load().await; let tline = tenant @@ -3144,7 +3162,8 @@ mod tests { #[tokio::test] async fn timeline_load_with_ancestor() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + const TEST_NAME: &str = "timeline_load_with_ancestor"; + let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { let tenant = harness.load().await; @@ -3182,7 +3201,8 @@ mod tests { #[tokio::test] async fn corrupt_metadata() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + const TEST_NAME: &str = "corrupt_metadata"; + let harness = TenantHarness::create(TEST_NAME)?; let tenant = harness.load().await; tenant @@ -3223,8 +3243,7 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_images")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3291,8 +3310,7 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_bulk_insert")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3336,8 +3354,7 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_random_updates")?.load().await; let tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3410,8 +3427,9 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_traverse_branches")? + .load() + .await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? .initialize()?; @@ -3495,8 +3513,9 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_traverse_ancestors")? + .load() + .await; let mut tline = tenant .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
.initialize()?; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 0debeaff1c..c433e65ad2 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -76,7 +76,7 @@ impl EphemeralFile { }) } - fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> io::Result<()> { + fn fill_buffer(&self, buf: &mut [u8], blkno: u32) -> Result<(), io::Error> { let mut off = 0; while off < PAGE_SZ { let n = self @@ -277,7 +277,7 @@ impl Drop for EphemeralFile { } } -pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> io::Result<()> { +pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> { if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) { match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) { Ok(_) => Ok(()), @@ -332,17 +332,25 @@ mod tests { use super::*; use crate::tenant::blob_io::{BlobCursor, BlobWriter}; use crate::tenant::block_io::BlockCursor; - use crate::tenant::harness::TenantHarness; use rand::{seq::SliceRandom, thread_rng, RngCore}; use std::fs; use std::str::FromStr; - fn harness() -> Result<(TenantHarness, TimelineId), io::Error> { - let harness = TenantHarness::new().expect("Failed to create tenant harness"); - let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); - fs::create_dir_all(harness.timeline_path(&timeline_id))?; + fn harness( + test_name: &str, + ) -> Result<(&'static PageServerConf, TenantId, TimelineId), io::Error> { + let repo_dir = PageServerConf::test_repo_dir(test_name); + let _ = fs::remove_dir_all(&repo_dir); + let conf = PageServerConf::dummy_conf(repo_dir); + // Make a static copy of the config. This can never be free'd, but that's + // OK in a test. + let conf: &'static PageServerConf = Box::leak(Box::new(conf)); - Ok((harness, timeline_id)) + let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap(); + let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap(); + fs::create_dir_all(conf.timeline_path(&timeline_id, &tenant_id))?; + + Ok((conf, tenant_id, timeline_id)) } // Helper function to slurp contents of a file, starting at the current position, @@ -359,10 +367,10 @@ mod tests { } #[test] - fn test_ephemeral_files() -> io::Result<()> { - let (harness, timeline_id) = harness()?; + fn test_ephemeral_files() -> Result<(), io::Error> { + let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?; - let file_a = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; + let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?; file_a.write_all_at(b"foo", 0)?; assert_eq!("foo", read_string(&file_a, 0, 20)?); @@ -373,7 +381,7 @@ mod tests { // Open a lot of files, enough to cause some page evictions. 
let mut efiles = Vec::new(); for fileno in 0..100 { - let efile = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; + let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?; efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?; assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?); efiles.push((fileno, efile)); @@ -390,10 +398,10 @@ mod tests { } #[test] - fn test_ephemeral_blobs() -> io::Result<()> { - let (harness, timeline_id) = harness()?; + fn test_ephemeral_blobs() -> Result<(), io::Error> { + let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?; - let mut file = EphemeralFile::create(harness.conf, harness.tenant_id, timeline_id)?; + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id)?; let pos_foo = file.write_blob(b"foo")?; assert_eq!(b"foo", file.block_cursor().read_blob(pos_foo)?.as_slice()); diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 58b7eea1eb..013591caee 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1064,7 +1064,7 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("upload_scheduling")?; let timeline_path = harness.timeline_path(&TIMELINE_ID); std::fs::create_dir_all(&timeline_path)?; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 3ad049cc21..fb216123c1 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -525,13 +525,12 @@ mod tests { }) } - fn test_files(test_name: &str, openfunc: OF) -> Result<(), Error> + fn test_files(testname: &str, openfunc: OF) -> Result<(), Error> where FD: Read + Write + Seek + FileExt, OF: Fn(&Path, &OpenOptions) -> Result, { - let temp_repo_dir = tempfile::tempdir()?; - let testdir = temp_repo_dir.path().join(test_name); + let testdir = crate::config::PageServerConf::test_repo_dir(testname); std::fs::create_dir_all(&testdir)?; let path_a = testdir.join("file_a"); @@ -633,8 +632,7 @@ mod tests { const THREADS: usize = 100; const SAMPLE: [u8; SIZE] = [0xADu8; SIZE]; - let temp_repo_dir = tempfile::tempdir()?; - let testdir = temp_repo_dir.path().join("vfile_concurrency"); + let testdir = crate::config::PageServerConf::test_repo_dir("vfile_concurrency"); std::fs::create_dir_all(&testdir)?; // Create a test file. diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 77fce95160..0de2e6654d 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1146,8 +1146,7 @@ mod tests { #[tokio::test] async fn test_relsize() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_relsize")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1324,8 +1323,7 @@ mod tests { // and then created it again within the same layer. #[tokio::test] async fn test_drop_extend() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_drop_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1378,8 +1376,7 @@ mod tests { // and then extended it again within the same layer. 
#[tokio::test] async fn test_truncate_extend() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_truncate_extend")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; @@ -1500,8 +1497,7 @@ mod tests { /// split into multiple 1 GB segments in Postgres. #[tokio::test] async fn test_large_rel() -> Result<()> { - let harness = TenantHarness::new()?; - let tenant = harness.load().await; + let tenant = TenantHarness::create("test_large_rel")?.load().await; let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; let mut walingest = init_walingest_test(&tline).await?; diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/walreceiver/connection_manager.rs index be58aa0e07..8b60e59305 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/walreceiver/connection_manager.rs @@ -846,7 +846,7 @@ mod tests { #[tokio::test] async fn no_connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("no_connection_no_candidate")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -879,7 +879,7 @@ mod tests { #[tokio::test] async fn connection_no_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("connection_no_candidate")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -942,7 +942,7 @@ mod tests { #[tokio::test] async fn no_connection_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("no_connection_candidate")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1001,7 +1001,7 @@ mod tests { #[tokio::test] async fn candidate_with_many_connection_failures() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("candidate_with_many_connection_failures")?; let mut state = dummy_state(&harness).await; let now = Utc::now().naive_utc(); @@ -1041,7 +1041,7 @@ mod tests { #[tokio::test] async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1105,7 +1105,7 @@ mod tests { #[tokio::test] async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1166,7 +1166,7 @@ mod tests { #[tokio::test] async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::new()?; + let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); @@ -1232,7 +1232,7 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; - async fn dummy_state(harness: &TenantHarness) -> WalreceiverState { + async fn dummy_state(harness: 
&TenantHarness<'_>) -> WalreceiverState { WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, diff --git a/test_runner/sql_regress/.gitignore b/test_runner/sql_regress/.gitignore index 83186b5c86..89129d7358 100644 --- a/test_runner/sql_regress/.gitignore +++ b/test_runner/sql_regress/.gitignore @@ -2,6 +2,7 @@ /pg_regress # Generated subdirectories +/tmp_check/ /results/ /log/ From 47f9890bae75c63fc4b29c009cf9020ac2bcbafa Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 20 Jan 2023 15:37:24 +0100 Subject: [PATCH 029/209] [compute_ctl] Make role deletion spec processing idempotent (#3380) Previously, we were trying to re-assign owned objects of the already deleted role. This were causing a crash loop in the case when compute was restarted with a spec that includes delta operation for role deletion. To avoid such cases, check that role is still present before calling `reassign_owned_objects`. Resolves neondatabase/cloud#3553 --- compute_tools/src/compute.rs | 3 ++- compute_tools/src/pg_helpers.rs | 4 ++-- compute_tools/src/spec.rs | 14 +++++++++++++- 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index e229bb1cc2..c8af8822b7 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -252,7 +252,7 @@ impl ComputeNode { // If connection fails, // it may be the old node with `zenith_admin` superuser. // - // In this case we need to connect with old `zenith_admin`name + // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. let mut client = match Client::connect(self.connstr.as_str(), NoTls) { @@ -278,6 +278,7 @@ impl ComputeNode { Ok(client) => client, }; + // Proceed with post-startup configuration. Note, that order of operations is important. handle_roles(&self.spec, &mut client)?; handle_databases(&self.spec, &mut client)?; handle_role_deletions(self, &mut client)?; diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 921289d7c2..6ab2864721 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -130,8 +130,8 @@ impl Role { /// Serialize a list of role parameters into a Postgres-acceptable /// string of arguments. pub fn to_pg_options(&self) -> String { - // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in Rails. - // For now we do not use generic `options` for roles. Once used, add + // XXX: consider putting LOGIN as a default option somewhere higher, e.g. in control-plane. + // For now, we do not use generic `options` for roles. Once used, add // `self.options.as_pg_options()` somewhere here. let mut params: String = "LOGIN".to_string(); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 40c8366bf4..97cd623052 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -213,8 +213,20 @@ pub fn handle_role_deletions(node: &ComputeNode, client: &mut Client) -> Result< if let Some(ops) = &node.spec.delta_operations { // First, reassign all dependent objects to db owners. info!("reassigning dependent objects of to-be-deleted roles"); + + // Fetch existing roles. We could've exported and used `existing_roles` from + // `handle_roles()`, but we only make this list there before creating new roles. + // Which is probably fine as we never create to-be-deleted roles, but that'd + // just look a bit untidy. 
Anyway, the entire `pg_roles` should be in shared + // buffers already, so this shouldn't be a big deal. + let mut xact = client.transaction()?; + let existing_roles: Vec = get_existing_roles(&mut xact)?; + xact.commit()?; + for op in ops { - if op.action == "delete_role" { + // Check that role is still present in Postgres, as this could be a + // restart with the same spec after role deletion. + if op.action == "delete_role" && existing_roles.iter().any(|r| r.name == op.name) { reassign_owned_objects(node, &op.name)?; } } From 802f17407259bf9ad027480721083662f66b13b1 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 20 Jan 2023 18:19:52 +0200 Subject: [PATCH 030/209] fix: dont stop pageserver if we fail to calculate synthetic size --- pageserver/src/consumption_metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index f8a0bc6f08..c07026261d 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -59,7 +59,7 @@ pub async fn collect_metrics( None, None, "synthetic size calculation", - true, + false, async move { calculate_synthetic_size_worker(synthetic_size_calculation_interval) .instrument(info_span!("synthetic_size_worker")) From 478322ebf90f8580258b02e1fb5c899c7f8ad279 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Fri, 20 Jan 2023 20:21:36 +0200 Subject: [PATCH 031/209] Fix tenant size orphans (#3377) Before only the timelines which have passed the `gc_horizon` were processed which failed with orphans at the tree_sort phase. Example input in added `test_branched_empty_timeline_size` test case. The PR changes iteration to happen through all timelines, and in addition to that, any learned branch points will be calculated as they would had been in the original implementation if the ancestor branch had been over the `gc_horizon`. This also changes how tenants where all timelines are below `gc_horizon` are handled. Previously tenant_size 0 was returned, but now they will have approximately `initdb_lsn` worth of tenant_size. The PR also adds several new tenant size tests that describe various corner cases of branching structure and `gc_horizon` setting. They are currently disabled to not consume time during CI. Co-authored-by: Joonas Koivunen Co-authored-by: Anastasia Lubennikova --- pageserver/src/tenant/size.rs | 149 ++++++++++++--- test_runner/fixtures/neon_fixtures.py | 9 +- test_runner/regress/test_tenant_size.py | 244 ++++++++++++++++++++++-- 3 files changed, 360 insertions(+), 42 deletions(-) diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index dd4bf768a7..2181d6d4dc 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -23,7 +23,13 @@ use tracing::*; pub struct ModelInputs { updates: Vec, retention_period: u64, + + /// Relevant lsns per timeline. + /// + /// This field is not required for deserialization purposes, which is mostly used in tests. The + /// LSNs explain the outcome (updates) but are not needed in size calculation. 
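The `#[serde(default)]` added to `timeline_inputs` below is what lets older inputs, such as the JSON fixture in the test further down, omit the field entirely. A minimal sketch of the same behaviour, assuming `serde` and `serde_json` (the struct and its field types here are illustrative):

    use std::collections::HashMap;

    #[derive(serde::Deserialize, Debug)]
    struct Inputs {
        updates: Vec<u64>,
        retention_period: u64,
        #[serde(default)]
        timeline_inputs: HashMap<String, u64>,
    }

    fn main() {
        // No `timeline_inputs` key: deserialization still succeeds with an empty map.
        let doc = r#"{"updates":[1,2,3],"retention_period":131072}"#;
        let inputs: Inputs = serde_json::from_str(doc).unwrap();
        assert!(inputs.timeline_inputs.is_empty());
        println!("{inputs:?}");
    }
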
#[serde_as(as = "HashMap")] + #[serde(default)] timeline_inputs: HashMap, } @@ -32,6 +38,8 @@ pub struct ModelInputs { #[serde_with::serde_as] #[derive(Debug, serde::Serialize, serde::Deserialize)] struct TimelineInputs { + #[serde_as(as = "serde_with::DisplayFromStr")] + ancestor_lsn: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] last_record: Lsn, #[serde_as(as = "serde_with::DisplayFromStr")] @@ -178,21 +186,13 @@ pub(super) async fn gather_inputs( // our advantage with `?` error handling. let mut joinset = tokio::task::JoinSet::new(); - let timelines = tenant + // refresh is needed to update gc related pitr_cutoff and horizon_cutoff + tenant .refresh_gc_info() .await .context("Failed to refresh gc_info before gathering inputs")?; - if timelines.is_empty() { - // All timelines are below tenant's gc_horizon; alternative would be to use - // Tenant::list_timelines but then those gc_info's would not be updated yet, possibly - // missing GcInfo::retain_lsns or having obsolete values for cutoff's. - return Ok(ModelInputs { - updates: vec![], - retention_period: 0, - timeline_inputs: HashMap::new(), - }); - } + let timelines = tenant.list_timelines(); // record the used/inserted cache keys here, to remove extras not to start leaking // after initial run the cache should be quite stable, but live timelines will eventually @@ -201,13 +201,25 @@ pub(super) async fn gather_inputs( let mut updates = Vec::new(); - // record the per timline values used to determine `retention_period` + // record the per timeline values useful to debug the model inputs, also used to track + // ancestor_lsn without keeping a hold of Timeline let mut timeline_inputs = HashMap::with_capacity(timelines.len()); // used to determine the `retention_period` for the size model let mut max_cutoff_distance = None; + // mapping from (TimelineId, Lsn) => if this branch point has been handled already via + // GcInfo::retain_lsns or if it needs to have its logical_size calculated. + let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new(); + for timeline in timelines { + if !timeline.is_active() { + anyhow::bail!( + "timeline {} is not active, cannot calculate tenant_size now", + timeline.timeline_id + ); + } + let last_record_lsn = timeline.get_last_record_lsn(); let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = { @@ -273,13 +285,30 @@ pub(super) async fn gather_inputs( // all timelines branch from something, because it might be impossible to pinpoint // which is the tenant_size_model's "default" branch. + + let ancestor_lsn = timeline.get_ancestor_lsn(); + updates.push(Update { - lsn: timeline.get_ancestor_lsn(), + lsn: ancestor_lsn, command: Command::BranchFrom(timeline.get_ancestor_timeline_id()), timeline_id: timeline.timeline_id, }); + if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() { + // refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches + // which are over gc_horizon. for example, a "main" branch which never received any + // updates apart from initdb not have branch points recorded. 
+ referenced_branch_froms + .entry((parent_timeline_id, timeline.get_ancestor_lsn())) + .or_default(); + } + for (lsn, _kind) in &interesting_lsns { + // mark this visited so don't need to re-process this parent + *referenced_branch_froms + .entry((timeline.timeline_id, *lsn)) + .or_default() = true; + if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) { updates.push(Update { lsn: *lsn, @@ -295,22 +324,10 @@ pub(super) async fn gather_inputs( } } - // all timelines also have an end point if they have made any progress - if last_record_lsn > timeline.get_ancestor_lsn() - && !interesting_lsns - .iter() - .any(|(lsn, _)| lsn == &last_record_lsn) - { - updates.push(Update { - lsn: last_record_lsn, - command: Command::EndOfBranch, - timeline_id: timeline.timeline_id, - }); - } - timeline_inputs.insert( timeline.timeline_id, TimelineInputs { + ancestor_lsn, last_record: last_record_lsn, // this is not used above, because it might not have updated recently enough latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(), @@ -321,6 +338,80 @@ pub(super) async fn gather_inputs( ); } + // iterate over discovered branch points and make sure we are getting logical sizes at those + // points. + for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() { + if *handled { + continue; + } + + let timeline_id = *timeline_id; + let lsn = *lsn; + + match timeline_inputs.get(&timeline_id) { + Some(inputs) if inputs.ancestor_lsn == lsn => { + // we don't need an update at this branch point which is also point where + // timeline_id branch was branched from. + continue; + } + Some(_) => {} + None => { + // we should have this because we have iterated through all of the timelines + anyhow::bail!("missing timeline_input for {timeline_id}") + } + } + + if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) { + updates.push(Update { + lsn, + timeline_id, + command: Command::Update(*size), + }); + + needed_cache.insert((timeline_id, lsn)); + } else { + let timeline = tenant + .get_timeline(timeline_id, false) + .context("find referenced ancestor timeline")?; + let parallel_size_calcs = Arc::clone(limit); + joinset.spawn(calculate_logical_size( + parallel_size_calcs, + timeline.clone(), + lsn, + )); + + if let Some(parent_id) = timeline.get_ancestor_timeline_id() { + // we should not find new ones because we iterated tenants all timelines + anyhow::ensure!( + timeline_inputs.contains_key(&parent_id), + "discovered new timeline {parent_id} (parent of {timeline_id})" + ); + } + }; + } + + // finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch + // point. this is needed by the model. + for (timeline_id, inputs) in timeline_inputs.iter() { + let lsn = inputs.last_record; + + if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) { + // this means that the (timeline_id, last_record_lsn) represents a branch point + // we do not want to add EndOfBranch updates for these points because it doesn't fit + // into the current tenant_size_model. + continue; + } + + if lsn > inputs.ancestor_lsn { + // all timelines also have an end point if they have made any progress + updates.push(Update { + lsn, + command: Command::EndOfBranch, + timeline_id: *timeline_id, + }); + } + } + let mut have_any_error = false; while let Some(res) = joinset.join_next().await { @@ -379,6 +470,7 @@ pub(super) async fn gather_inputs( // handled by the variant order in `Command`. 
// updates.sort_unstable(); + // And another sort to handle Command::BranchFrom ordering // in case when there are multiple branches at the same LSN. let sorted_updates = sort_updates_in_tree_order(updates)?; @@ -574,7 +666,10 @@ fn updates_sort() { fn verify_size_for_multiple_branches() { // this is generated from integration test test_tenant_size_with_multiple_branches, but this way // it has the stable lsn's - let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072,"timeline_inputs":{"cd9d9409c216e64bf580904facedb01b":{"last_record":"0/18D5E40","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/18B5E40","pitr_cutoff":"0/18B5E40","next_gc_cutoff":"0/18B5E40"},"10b532a550540bc15385eac4edde416a":{"last_record":"0/1839818","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/1819818","pitr_cutoff":"0/1819818","next_gc_cutoff":"0/1819818"},"230fc9d756f7363574c0d66533564dcc":{"last_record":"0/222F438","latest_gc_cutoff":"0/169ACF0","horizon_cutoff":"0/220F438","pitr_cutoff":"0/220F438","next_gc_cutoff":"0/220F438"}}}"#; + // + // timelineinputs have been left out, because those explain the inputs, but don't participate + // in further size calculations. 
+ let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#; let inputs: ModelInputs = serde_json::from_str(doc).unwrap(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index d6c4c32b0b..8476066691 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1206,6 +1206,9 @@ class PageserverHttpClient(requests.Session): return res_json def tenant_size(self, tenant_id: TenantId) -> int: + return self.tenant_size_and_modelinputs(tenant_id)[0] + + def tenant_size_and_modelinputs(self, tenant_id: TenantId) -> Tuple[int, Dict[str, Any]]: """ Returns the tenant size, together with the model inputs as the second tuple item. """ @@ -1216,9 +1219,9 @@ class PageserverHttpClient(requests.Session): assert TenantId(res["id"]) == tenant_id size = res["size"] assert type(size) == int - # there are additional inputs, which are the collected raw information before being fed to the tenant_size_model - # there are no tests for those right now. 
- return size + inputs = res["inputs"] + assert type(inputs) is dict + return (size, inputs) def timeline_list( self, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 5747ae235f..72cfbc9dda 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,5 +1,6 @@ -from typing import List, Tuple +from typing import Any, List, Tuple +import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.types import Lsn @@ -9,28 +10,247 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv): env = neon_simple_env (tenant_id, _) = env.neon_cli.create_tenant() http_client = env.pageserver.http_client() - size = http_client.tenant_size(tenant_id) + initial_size = http_client.tenant_size(tenant_id) - # we should never have zero, because there should be the initdb however - # this is questionable if we should have anything in this case, as the - # gc_cutoff is negative - assert ( - size == 0 - ), "initial implementation returns zero tenant_size before last_record_lsn is past gc_horizon" + # we should never have zero, because there should be the initdb "changes" + assert initial_size > 0, "initial implementation returns ~initdb tenant_size" - with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + main_branch_name = "main" + + with env.postgres.create_start( + main_branch_name, + tenant_id=tenant_id, + config_lines=["autovacuum=off", "checkpoint_timeout=10min"], + ) as pg: with pg.cursor() as cur: cur.execute("SELECT 1") row = cur.fetchone() assert row is not None assert row[0] == 1 size = http_client.tenant_size(tenant_id) - assert size == 0, "starting idle compute should not change the tenant size" + # we've disabled the autovacuum and checkpoint + # so background processes should not change the size. + # If this test will flake we should probably loosen the check + assert size == initial_size, "starting idle compute should not change the tenant size" # the size should be the same, until we increase the size over the # gc_horizon - size = http_client.tenant_size(tenant_id) - assert size == 0, "tenant_size should not be affected by shutdown of compute" + size, inputs = http_client.tenant_size_and_modelinputs(tenant_id) + assert size == initial_size, "tenant_size should not be affected by shutdown of compute" + + expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"] + actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"])) # type: ignore + assert actual_commands == expected_commands + + +def test_branched_empty_timeline_size(neon_simple_env: NeonEnv): + """ + Issue found in production. Because the ancestor branch was under + gc_horizon, the branchpoint was "dangling" and the computation could not be + done. 
+ + Assuming gc_horizon = 50 + root: I 0---10------>20 + branch: |-------------------I---------->150 + gc_horizon + """ + env = neon_simple_env + (tenant_id, _) = env.neon_cli.create_tenant() + http_client = env.pageserver.http_client() + + initial_size = http_client.tenant_size(tenant_id) + + first_branch_timeline_id = env.neon_cli.create_branch("first-branch", tenant_id=tenant_id) + + with env.postgres.create_start("first-branch", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + wait_for_last_flush_lsn(env, pg, tenant_id, first_branch_timeline_id) + + size_after_branching = http_client.tenant_size(tenant_id) + log.info(f"size_after_branching: {size_after_branching}") + + assert size_after_branching > initial_size + + +def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv): + """ + More general version of test_branched_empty_timeline_size + + Assuming gc_horizon = 50 + + root: I 0------10 + first: I 10 + nth_0: I 10 + nth_1: I 10 + nth_n: 10------------I--------100 + """ + env = neon_simple_env + (tenant_id, _) = env.neon_cli.create_tenant() + http_client = env.pageserver.http_client() + + initial_size = http_client.tenant_size(tenant_id) + + first_branch_name = "first" + env.neon_cli.create_branch(first_branch_name, tenant_id=tenant_id) + + size_after_branching = http_client.tenant_size(tenant_id) + + # this might be flaky like test_get_tenant_size_with_multiple_branches + # https://github.com/neondatabase/neon/issues/2962 + assert size_after_branching == initial_size + + last_branch_name = first_branch_name + last_branch = None + + for i in range(0, 4): + latest_branch_name = f"nth_{i}" + last_branch = env.neon_cli.create_branch( + latest_branch_name, ancestor_branch_name=last_branch_name, tenant_id=tenant_id + ) + last_branch_name = latest_branch_name + + size_after_branching = http_client.tenant_size(tenant_id) + assert size_after_branching == initial_size + + assert last_branch is not None + + with env.postgres.create_start(last_branch_name, tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + wait_for_last_flush_lsn(env, pg, tenant_id, last_branch) + + size_after_writes = http_client.tenant_size(tenant_id) + assert size_after_writes > initial_size + + +@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") +def test_branch_point_within_horizon(neon_simple_env: NeonEnv): + """ + gc_horizon = 15 + + main: 0--I-10------>20 + branch: |-------------------I---------->150 + gc_horizon + """ + + env = neon_simple_env + gc_horizon = 20_000 + (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) + http_client = env.pageserver.http_client() + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") + flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + + size_before_branching = http_client.tenant_size(tenant_id) + + assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int + + branch_id = env.neon_cli.create_branch( + "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn + ) + + with env.postgres.create_start("branch", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + 
cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, branch_id) + + size_after = http_client.tenant_size(tenant_id) + + assert size_before_branching < size_after + + +@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") +def test_parent_within_horizon(neon_simple_env: NeonEnv): + """ + gc_horizon = 5 + + main: 0----10----I->20 + branch: |-------------------I---------->150 + gc_horizon + """ + + env = neon_simple_env + gc_horizon = 200_000 + (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) + http_client = env.pageserver.http_client() + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + with pg.cursor() as cur: + cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000) s(i)") + + flushed_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + + with pg.cursor() as cur: + cur.execute("CREATE TABLE t00 AS SELECT i::bigint n FROM generate_series(0, 2000) s(i)") + + wait_for_last_flush_lsn(env, pg, tenant_id, main_id) + + size_before_branching = http_client.tenant_size(tenant_id) + + assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int + + branch_id = env.neon_cli.create_branch( + "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn + ) + + with env.postgres.create_start("branch", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)") + wait_for_last_flush_lsn(env, pg, tenant_id, branch_id) + + size_after = http_client.tenant_size(tenant_id) + + assert size_before_branching < size_after + + +@pytest.mark.skip("This should work, but is left out because assumed covered by other tests") +def test_only_heads_within_horizon(neon_simple_env: NeonEnv): + """ + gc_horizon = small + + main: 0--------10-----I>20 + first: |-----------------------------I>150 + second: |---------I>30 + """ + + env = neon_simple_env + (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": "1024"}) + http_client = env.pageserver.http_client() + + initial_size = http_client.tenant_size(tenant_id) + + first_id = env.neon_cli.create_branch("first", tenant_id=tenant_id) + second_id = env.neon_cli.create_branch("second", tenant_id=tenant_id) + + ids = {"main": main_id, "first": first_id, "second": second_id} + + latest_size = None + + # gc is not expected to change the results + + for branch_name, amount in [("main", 2000), ("first", 15000), ("second", 3000)]: + with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute( + f"CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, {amount}) s(i)" + ) + wait_for_last_flush_lsn(env, pg, tenant_id, ids[branch_name]) + size_now = http_client.tenant_size(tenant_id) + if latest_size is not None: + assert size_now > latest_size + else: + assert size_now > initial_size + + latest_size = size_now def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder): From 664a69e65ba79c8a3b6c5bd8d428de41f2243bd7 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Mon, 23 Jan 2023 10:51:09 +0200 Subject: [PATCH 032/209] Fix slru_segment_key_range function: segno was assigned to incorrect Key field (#3354) --- pageserver/src/pgdatadir_mapping.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 6ae70e3a30..cc521c5e35 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1405,15 +1405,15 @@ fn slru_segment_key_range(kind: SlruKind, segno: u32) -> Range { Key { field1: 0x01, field2, - field3: segno, - field4: 0, + field3: 1, + field4: segno, field5: 0, field6: 0, }..Key { field1: 0x01, field2, - field3: segno, - field4: 0, + field3: 1, + field4: segno, field5: 1, field6: 0, } From edb02d3299772e9075561a3b41141d42894e3287 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 23 Jan 2023 15:08:48 +0200 Subject: [PATCH 033/209] Adding pageserver3 to staging (#3403) --- .github/ansible/staging.us-east-2.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml index 4891875369..b46e729e32 100644 --- a/.github/ansible/staging.us-east-2.hosts.yaml +++ b/.github/ansible/staging.us-east-2.hosts.yaml @@ -29,6 +29,8 @@ storage: ansible_host: i-0565a8b4008aa3f40 pageserver-2.us-east-2.aws.neon.build: ansible_host: i-01e31cdf7e970586a + pageserver-3.us-east-2.aws.neon.build: + ansible_host: i-0602a0291365ef7cc safekeepers: hosts: From f67f0c1c11b4f5226225195b5be956c154ea4200 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 23 Jan 2023 17:12:51 +0200 Subject: [PATCH 034/209] More tenant size fixes (#3410) Small changes, but hopefully this will help with the panic detected in staging, for which we cannot get the debugging information right now (end-of-branch before branch-point). --- libs/tenant_size_model/src/lib.rs | 83 ++++++++++++---------- libs/tenant_size_model/src/main.rs | 107 +++++++++++++++-------------- pageserver/src/tenant/size.rs | 15 +++- 3 files changed, 114 insertions(+), 91 deletions(-) diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs index 92bec8aebe..b156e1be9d 100644 --- a/libs/tenant_size_model/src/lib.rs +++ b/libs/tenant_size_model/src/lib.rs @@ -134,22 +134,25 @@ impl Storage { op: Cow<'static, str>, lsn: u64, size: Option, - ) where + ) -> anyhow::Result<()> + where K: std::borrow::Borrow, Q: std::hash::Hash + Eq + std::fmt::Debug, { - let lastseg_id = *self.branches.get(branch).unwrap(); + let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; let newseg_id = self.segments.len(); let lastseg = &mut self.segments[lastseg_id]; assert!(lsn > lastseg.end_lsn); + let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; + let newseg = Segment { op, parent: Some(lastseg_id), start_lsn: lastseg.end_lsn, end_lsn: lsn, - start_size: lastseg.end_size.unwrap(), + start_size, end_size: size, children_after: Vec::new(), needed: false, @@ -158,6 +161,8 @@ impl Storage { self.segments.push(newseg); *self.branches.get_mut(branch).expect("read already") = newseg_id; + + Ok(()) } /// Advances the branch with the named operation, by the relative LSN and logical size bytes. 
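The hunks in this file replace `.unwrap()` lookups with `let ... else { anyhow::bail!(...) }`, so a missing branch or an unknown `end_size` surfaces as an `Err` the caller can propagate instead of a panic. A minimal sketch of that pattern, with an illustrative helper name that is not part of the patch:

    use anyhow::bail;

    // Illustrative only: mirrors how insert_point/modify_branch now fail softly
    // when the latest segment has no known end_size.
    fn last_known_size(end_size: Option<u64>, branch: &str) -> anyhow::Result<u64> {
        let Some(size) = end_size else {
            bail!("no end_size on latest segment for {branch:?}");
        };
        Ok(size)
    }
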
@@ -167,21 +172,24 @@ impl Storage { op: Cow<'static, str>, lsn_bytes: u64, size_bytes: i64, - ) where + ) -> anyhow::Result<()> + where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - let lastseg_id = *self.branches.get(branch).unwrap(); + let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") }; let newseg_id = self.segments.len(); let lastseg = &mut self.segments[lastseg_id]; + let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") }; + let newseg = Segment { op, parent: Some(lastseg_id), start_lsn: lastseg.end_lsn, end_lsn: lastseg.end_lsn + lsn_bytes, - start_size: lastseg.end_size.unwrap(), - end_size: Some((lastseg.end_size.unwrap() as i64 + size_bytes) as u64), + start_size: last_end_size, + end_size: Some((last_end_size as i64 + size_bytes) as u64), children_after: Vec::new(), needed: false, }; @@ -189,33 +197,33 @@ impl Storage { self.segments.push(newseg); *self.branches.get_mut(branch).expect("read already") = newseg_id; + Ok(()) } - pub fn insert(&mut self, branch: &Q, bytes: u64) + pub fn insert(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - self.modify_branch(branch, "insert".into(), bytes, bytes as i64); + self.modify_branch(branch, "insert".into(), bytes, bytes as i64) } - pub fn update(&mut self, branch: &Q, bytes: u64) + pub fn update(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - self.modify_branch(branch, "update".into(), bytes, 0i64); + self.modify_branch(branch, "update".into(), bytes, 0i64) } - pub fn delete(&mut self, branch: &Q, bytes: u64) + pub fn delete(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()> where K: std::borrow::Borrow, - Q: std::hash::Hash + Eq, + Q: std::hash::Hash + Eq + std::fmt::Debug, { - self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)); + self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64)) } - /// Panics if the parent branch cannot be found. 
pub fn branch(&mut self, parent: &Q, name: K) -> anyhow::Result<()> where K: std::borrow::Borrow + std::fmt::Debug, @@ -236,7 +244,7 @@ impl Storage { Ok(()) } - pub fn calculate(&mut self, retention_period: u64) -> SegmentSize { + pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result { // Phase 1: Mark all the segments that need to be retained for (_branch, &last_seg_id) in self.branches.iter() { let last_seg = &self.segments[last_seg_id]; @@ -261,7 +269,7 @@ impl Storage { self.size_from_snapshot_later(0) } - fn size_from_wal(&self, seg_id: usize) -> SegmentSize { + fn size_from_wal(&self, seg_id: usize) -> anyhow::Result { let seg = &self.segments[seg_id]; let this_size = seg.end_lsn - seg.start_lsn; @@ -272,10 +280,10 @@ impl Storage { for &child_id in seg.children_after.iter() { // try each child both ways let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id); + let p1 = self.size_from_wal(child_id)?; let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id); + let p2 = self.size_from_snapshot_later(child_id)?; if p1.total() < p2.total() { p1 } else { @@ -286,15 +294,15 @@ impl Storage { }; children.push(p); } - SegmentSize { + Ok(SegmentSize { seg_id, method: if seg.needed { WalNeeded } else { Wal }, this_size, children, - } + }) } - fn size_from_snapshot_later(&self, seg_id: usize) -> SegmentSize { + fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result { // If this is needed, then it's time to do the snapshot and continue // with wal method. let seg = &self.segments[seg_id]; @@ -305,10 +313,10 @@ impl Storage { for &child_id in seg.children_after.iter() { // try each child both ways let child = &self.segments[child_id]; - let p1 = self.size_from_wal(child_id); + let p1 = self.size_from_wal(child_id)?; let p = if !child.needed { - let p2 = self.size_from_snapshot_later(child_id); + let p2 = self.size_from_snapshot_later(child_id)?; if p1.total() < p2.total() { p1 } else { @@ -319,12 +327,12 @@ impl Storage { }; children.push(p); } - SegmentSize { + Ok(SegmentSize { seg_id, method: WalNeeded, this_size: seg.start_size, children, - } + }) } else { // If any of the direct children are "needed", need to be able to reconstruct here let mut children_needed = false; @@ -339,7 +347,7 @@ impl Storage { let method1 = if !children_needed { let mut children = Vec::new(); for child in seg.children_after.iter() { - children.push(self.size_from_snapshot_later(*child)); + children.push(self.size_from_snapshot_later(*child)?); } Some(SegmentSize { seg_id, @@ -355,20 +363,25 @@ impl Storage { let method2 = if children_needed || seg.children_after.len() >= 2 { let mut children = Vec::new(); for child in seg.children_after.iter() { - children.push(self.size_from_wal(*child)); + children.push(self.size_from_wal(*child)?); } + let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") }; Some(SegmentSize { seg_id, method: SnapshotAfter, - this_size: seg.end_size.unwrap(), + this_size, children, }) } else { None }; - match (method1, method2) { - (None, None) => panic!(), + Ok(match (method1, method2) { + (None, None) => anyhow::bail!( + "neither method was applicable: children_after={}, children_needed={}", + seg.children_after.len(), + children_needed + ), (Some(method), None) => method, (None, Some(method)) => method, (Some(method1), Some(method2)) => { @@ -378,7 +391,7 @@ impl Storage { method2 } } - } + }) } } diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs 
index 9378a98a09..e32dd055f4 100644 --- a/libs/tenant_size_model/src/main.rs +++ b/libs/tenant_size_model/src/main.rs @@ -7,118 +7,118 @@ use tenant_size_model::{Segment, SegmentSize, Storage}; // Main branch only. Some updates on it. -fn scenario_1() -> (Vec, SegmentSize) { +fn scenario_1() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } // Main branch only. Some updates on it. -fn scenario_2() -> (Vec, SegmentSize) { +fn scenario_2() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } // Branch - storage.branch("main", "child").unwrap(); - storage.update("child", 1_000); + storage.branch("main", "child")?; + storage.update("child", 1_000)?; // More updates on parent - storage.update("main", 1_000); + storage.update("main", 1_000)?; - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } // Like 2, but more updates on main -fn scenario_3() -> (Vec, SegmentSize) { +fn scenario_3() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } // Branch - storage.branch("main", "child").unwrap(); - storage.update("child", 1_000); + storage.branch("main", "child")?; + storage.update("child", 1_000)?; // More updates on parent for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } // Diverged branches -fn scenario_4() -> (Vec, SegmentSize) { +fn scenario_4() -> anyhow::Result<(Vec, SegmentSize)> { // Create main branch let mut storage = Storage::new("main"); // Bulk load 5 GB of data to it - storage.insert("main", 5_000); + storage.insert("main", 5_000)?; // Stream of updates for _ in 0..5 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } // Branch - storage.branch("main", "child").unwrap(); - storage.update("child", 1_000); + storage.branch("main", "child")?; + storage.update("child", 1_000)?; // More updates on parent for _ in 0..8 { - storage.update("main", 1_000); + storage.update("main", 1_000)?; } - let size = storage.calculate(1000); + let size = storage.calculate(1000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } -fn scenario_5() -> (Vec, SegmentSize) { +fn scenario_5() -> anyhow::Result<(Vec, SegmentSize)> { let mut storage = Storage::new("a"); - storage.insert("a", 5000); - storage.branch("a", "b").unwrap(); - storage.update("b", 4000); - storage.update("a", 2000); - storage.branch("a", "c").unwrap(); - storage.insert("c", 4000); - storage.insert("a", 2000); + 
storage.insert("a", 5000)?; + storage.branch("a", "b")?; + storage.update("b", 4000)?; + storage.update("a", 2000)?; + storage.branch("a", "c")?; + storage.insert("c", 4000)?; + storage.insert("a", 2000)?; - let size = storage.calculate(5000); + let size = storage.calculate(5000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } -fn scenario_6() -> (Vec, SegmentSize) { +fn scenario_6() -> anyhow::Result<(Vec, SegmentSize)> { use std::borrow::Cow; const NO_OP: Cow<'static, str> = Cow::Borrowed(""); @@ -133,18 +133,18 @@ fn scenario_6() -> (Vec, SegmentSize) { let mut storage = Storage::new(None); - storage.branch(&None, branches[0]).unwrap(); // at 0 - storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128); // at 108951064 - storage.branch(&branches[0], branches[1]).unwrap(); // at 108951064 - storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392); // at 124511472 - storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904); // at 283415424 - storage.branch(&branches[0], branches[2]).unwrap(); // at 283415424 - storage.modify_branch(&branches[2], NO_OP, 15906192, 8192); // at 299321616 - storage.modify_branch(&branches[0], NO_OP, 18909976, 32768); // at 302325400 + storage.branch(&None, branches[0])?; // at 0 + storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064 + storage.branch(&branches[0], branches[1])?; // at 108951064 + storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472 + storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424 + storage.branch(&branches[0], branches[2])?; // at 283415424 + storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616 + storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400 - let size = storage.calculate(100_000); + let size = storage.calculate(100_000)?; - (storage.into_segments(), size) + Ok((storage.into_segments(), size)) } fn main() { @@ -163,7 +163,8 @@ fn main() { eprintln!("invalid scenario {}", other); std::process::exit(1); } - }; + } + .unwrap(); graphviz_tree(&segments, &size); } @@ -251,7 +252,7 @@ fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) { #[test] fn scenarios_return_same_size() { - type ScenarioFn = fn() -> (Vec, SegmentSize); + type ScenarioFn = fn() -> anyhow::Result<(Vec, SegmentSize)>; let truths: &[(u32, ScenarioFn, _)] = &[ (line!(), scenario_1, 8000), (line!(), scenario_2, 9000), @@ -262,7 +263,7 @@ fn scenarios_return_same_size() { ]; for (line, scenario, expected) in truths { - let (_, size) = scenario(); + let (_, size) = scenario().unwrap(); assert_eq!(*expected, size.total_children(), "scenario on line {line}"); } } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 2181d6d4dc..61cb32fc76 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -194,6 +194,15 @@ pub(super) async fn gather_inputs( let timelines = tenant.list_timelines(); + if timelines.is_empty() { + // perhaps the tenant has just been created, and as such doesn't have any data yet + return Ok(ModelInputs { + updates: vec![], + retention_period: 0, + timeline_inputs: HashMap::default(), + }); + } + // record the used/inserted cache keys here, to remove extras not to start leaking // after initial run the cache should be quite stable, but live timelines will eventually // require new lsns to be inspected. 
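Taken together with the scenario functions above, callers now compose the fallible API with `?`. A hypothetical driver in that style (the method names and `total_children()` come from this patch; the concrete numbers are arbitrary):

    use tenant_size_model::Storage;

    // Loosely mirrors scenario_2 above: bulk load, updates, one branch, then price the model.
    fn tenant_size_example() -> anyhow::Result<u64> {
        let mut storage = Storage::new("main");
        storage.insert("main", 5_000)?;       // bulk load
        storage.update("main", 1_000)?;       // stream of updates
        storage.branch("main", "child")?;     // errors if the parent branch is unknown
        storage.update("child", 1_000)?;
        let size = storage.calculate(1_000)?; // errors instead of panicking on bad inputs
        Ok(size.total_children())
    }
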
@@ -505,10 +514,10 @@ impl ModelInputs { let Lsn(now) = *lsn; match op { Command::Update(sz) => { - storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz)); + storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?; } Command::EndOfBranch => { - storage.insert_point(&Some(*timeline_id), "".into(), now, None); + storage.insert_point(&Some(*timeline_id), "".into(), now, None)?; } Command::BranchFrom(parent) => { // This branch command may fail if it cannot find a parent to branch from. @@ -517,7 +526,7 @@ impl ModelInputs { } } - Ok(storage.calculate(self.retention_period).total_children()) + Ok(storage.calculate(self.retention_period)?.total_children()) } } From eb36403e71210b1be7e2482fc385b8da8c149d5f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 31 Jan 2023 14:06:35 +0100 Subject: [PATCH 035/209] Release 2023 01 31 (#3497) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kirill Bulatov Co-authored-by: Heikki Linnakangas Co-authored-by: Anastasia Lubennikova Co-authored-by: bojanserafimov Co-authored-by: Christian Schwarz Co-authored-by: Alexey Kondratov Co-authored-by: Joonas Koivunen Co-authored-by: Konstantin Knizhnik Co-authored-by: Shany Pozin Co-authored-by: Sergey Melnikov Co-authored-by: Dmitry Rodionov Co-authored-by: Rory de Zoete <33318916+zoete@users.noreply.github.com> Co-authored-by: Rory de Zoete Co-authored-by: Rory de Zoete Co-authored-by: Lassi Pölönen --- .../actions/run-python-test-set/action.yml | 4 +- .github/ansible/deploy.yaml | 6 +- .../dev-us-east-2-beta.neon-proxy-link.yaml | 1 + ...prod-us-east-2-delta.neon-proxy-link.yaml} | 24 +- ...us-west-2-eta.neon-proxy-scram-legacy.yaml | 61 ++ .github/workflows/build_and_test.yml | 572 +++--------- .github/workflows/deploy-dev.yml | 179 ++++ .github/workflows/deploy-prod.yml | 277 ++++++ .github/workflows/release.yml | 33 + Cargo.lock | 328 ++++--- Cargo.toml | 10 +- ...ompute-node-v14 => Dockerfile.compute-node | 34 +- Dockerfile.compute-node-v15 | 220 ----- compute_tools/Cargo.toml | 3 + compute_tools/src/bin/compute_ctl.rs | 32 +- compute_tools/src/http/api.rs | 27 +- compute_tools/src/logger.rs | 24 +- compute_tools/src/params.rs | 8 +- compute_tools/src/spec.rs | 23 +- libs/metrics/src/lib.rs | 1 + libs/pageserver_api/src/models.rs | 37 +- libs/tracing-utils/Cargo.toml | 17 + libs/tracing-utils/src/http.rs | 96 ++ libs/tracing-utils/src/lib.rs | 168 ++++ libs/utils/Cargo.toml | 1 + libs/utils/src/http/error.rs | 17 +- libs/utils/src/logging.rs | 2 +- pageserver/Cargo.toml | 3 +- pageserver/benches/bench_layer_map.rs | 224 ++--- pageserver/src/basebackup.rs | 50 +- pageserver/src/bin/pageserver.rs | 57 +- pageserver/src/broker_client.rs | 48 + pageserver/src/config.rs | 28 + pageserver/src/consumption_metrics.rs | 24 +- pageserver/src/context.rs | 199 ++++ pageserver/src/http/openapi_spec.yml | 10 +- pageserver/src/http/routes.rs | 170 ++-- pageserver/src/import_datadir.rs | 52 +- pageserver/src/lib.rs | 3 +- pageserver/src/metrics.rs | 166 +++- pageserver/src/page_service.rs | 223 +++-- pageserver/src/pgdatadir_mapping.rs | 169 ++-- pageserver/src/repository.rs | 11 + pageserver/src/task_mgr.rs | 45 +- pageserver/src/tenant.rs | 576 ++++++------ pageserver/src/tenant/config.rs | 7 +- pageserver/src/tenant/layer_map.rs | 877 +++++++++--------- .../layer_map/historic_layer_coverage.rs | 583 ++++++++++++ .../src/tenant/layer_map/layer_coverage.rs | 154 +++ pageserver/src/tenant/mgr.rs | 236 +++-- 
.../src/tenant/remote_timeline_client.rs | 25 +- pageserver/src/tenant/size.rs | 16 +- pageserver/src/tenant/storage_layer.rs | 47 + pageserver/src/tenant/tasks.rs | 13 +- pageserver/src/tenant/timeline.rs | 327 +++++-- .../src/{ => tenant/timeline}/walreceiver.rs | 44 - .../walreceiver/connection_manager.rs | 55 +- .../walreceiver/walreceiver_connection.rs | 29 +- pageserver/src/walingest.rs | 418 ++++++--- pageserver/src/walredo.rs | 329 +++++-- poetry.lock | 247 ++++- proxy/src/main.rs | 4 +- pyproject.toml | 1 + safekeeper/src/bin/safekeeper.rs | 7 +- scripts/force_layer_download.py | 324 +++++++ storage_broker/src/bin/storage_broker.rs | 4 +- test_runner/fixtures/metrics.py | 12 +- test_runner/regress/test_tenant_conf.py | 55 +- test_runner/regress/test_tenant_detach.py | 46 +- test_runner/regress/test_tenants.py | 56 +- workspace_hack/Cargo.toml | 8 +- 71 files changed, 5779 insertions(+), 2408 deletions(-) rename .github/helm-values/{production.proxy.yaml => prod-us-east-2-delta.neon-proxy-link.yaml} (80%) create mode 100644 .github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml create mode 100644 .github/workflows/deploy-dev.yml create mode 100644 .github/workflows/deploy-prod.yml create mode 100644 .github/workflows/release.yml rename Dockerfile.compute-node-v14 => Dockerfile.compute-node (86%) delete mode 100644 Dockerfile.compute-node-v15 create mode 100644 libs/tracing-utils/Cargo.toml create mode 100644 libs/tracing-utils/src/http.rs create mode 100644 libs/tracing-utils/src/lib.rs create mode 100644 pageserver/src/broker_client.rs create mode 100644 pageserver/src/context.rs create mode 100644 pageserver/src/tenant/layer_map/historic_layer_coverage.rs create mode 100644 pageserver/src/tenant/layer_map/layer_coverage.rs rename pageserver/src/{ => tenant/timeline}/walreceiver.rs (83%) rename pageserver/src/{ => tenant/timeline}/walreceiver/connection_manager.rs (96%) rename pageserver/src/{ => tenant/timeline}/walreceiver/walreceiver_connection.rs (94%) create mode 100644 scripts/force_layer_download.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 990c7e25a9..29b04a3478 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -123,8 +123,8 @@ runs: exit 1 fi if [[ "${{ inputs.run_in_parallel }}" == "true" ]]; then - # -n4 uses four processes to run tests via pytest-xdist - EXTRA_PARAMS="-n4 $EXTRA_PARAMS" + # -n16 uses sixteen processes to run tests via pytest-xdist + EXTRA_PARAMS="-n16 $EXTRA_PARAMS" # --dist=loadgroup points tests marked with @pytest.mark.xdist_group # to the same worker to make @pytest.mark.order work with xdist diff --git a/.github/ansible/deploy.yaml b/.github/ansible/deploy.yaml index 4adc685684..a17dc9c78f 100644 --- a/.github/ansible/deploy.yaml +++ b/.github/ansible/deploy.yaml @@ -117,7 +117,8 @@ shell: cmd: | INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/pageservers/$INSTANCE_ID + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/pageservers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/pageservers tags: - pageserver @@ -186,6 +187,7 @@ shell: cmd: | INSTANCE_ID=$(curl -s 
http://169.254.169.254/latest/meta-data/instance-id) - curl -sfS -d '{"version": {{ current_version }} }' -X PATCH {{ console_mgmt_base_url }}/api/v1/safekeepers/$INSTANCE_ID + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" {{ console_mgmt_base_url }}/management/api/v2/safekeepers/$INSTANCE_ID | jq '.version = {{ current_version }}' > /tmp/new_version + curl -sfS -H "Authorization: Bearer {{ CONSOLE_API_TOKEN }}" -X POST -d@/tmp/new_version {{ console_mgmt_base_url }}/management/api/v2/safekeepers tags: - safekeeper diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml index cb062f705d..157ae66ed1 100644 --- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml +++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml @@ -8,6 +8,7 @@ settings: authBackend: "link" authEndpoint: "https://console.stage.neon.tech/authenticate_proxy_request/" uri: "https://console.stage.neon.tech/psql_session/" + domain: "pg.neon.build" sentryEnvironment: "staging" metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events" metricCollectionInterval: "1min" diff --git a/.github/helm-values/production.proxy.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml similarity index 80% rename from .github/helm-values/production.proxy.yaml rename to .github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml index dbaf3cd096..eff24302bb 100644 --- a/.github/helm-values/production.proxy.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-link.yaml @@ -1,37 +1,37 @@ +# Helm chart values for neon-proxy-link. +# This is a YAML-formatted file. + +image: + repository: neondatabase/neon + settings: authBackend: "link" authEndpoint: "https://console.neon.tech/authenticate_proxy_request/" uri: "https://console.neon.tech/psql_session/" + domain: "pg.neon.tech" sentryEnvironment: "production" # -- Additional labels for zenith-proxy pods podLabels: zenith_service: proxy zenith_env: production - zenith_region: us-west-2 - zenith_region_slug: oregon + zenith_region: us-east-2 + zenith_region_slug: us-east-2 service: + type: LoadBalancer annotations: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internal - external-dns.alpha.kubernetes.io/hostname: proxy-release.local - type: LoadBalancer + external-dns.alpha.kubernetes.io/hostname: neon-proxy-link-mgmt.delta.us-east-2.aws.neon.tech exposedService: annotations: service.beta.kubernetes.io/aws-load-balancer-type: external service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing - external-dns.alpha.kubernetes.io/hostname: connect.neon.tech,pg.neon.tech - -metrics: - enabled: true - serviceMonitor: - enabled: true - selector: - release: kube-prometheus-stack + external-dns.alpha.kubernetes.io/hostname: neon-proxy-link.delta.us-east-2.aws.neon.tech extraManifests: - apiVersion: operator.victoriametrics.com/v1beta1 diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml new file mode 100644 index 0000000000..3a5cde4b01 --- /dev/null +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -0,0 +1,61 @@ +# Helm chart values for neon-proxy-scram. +# This is a YAML-formatted file. 
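+# Legacy SCRAM proxy for the prod us-west-2 (eta) cluster: console auth backend,
+# internet-facing NLB, and a VMServiceScrape for metrics collection.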
+ +image: + repository: neondatabase/neon + +settings: + authBackend: "console" + authEndpoint: "http://console-release.local/management/api/v2" + domain: "*.cloud.neon.tech" + sentryEnvironment: "production" + wssPort: 8443 + metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events" + metricCollectionInterval: "10min" + +# -- Additional labels for neon-proxy pods +podLabels: + zenith_service: proxy-scram + zenith_env: prod + zenith_region: us-west-2 + zenith_region_slug: us-west-2 + +exposedService: + annotations: + service.beta.kubernetes.io/aws-load-balancer-type: external + service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip + service.beta.kubernetes.io/aws-load-balancer-scheme: internet-facing + external-dns.alpha.kubernetes.io/hostname: neon-proxy-scram-legacy.eta.us-west-2.aws.neon.tech + httpsPort: 443 + +#metrics: +# enabled: true +# serviceMonitor: +# enabled: true +# selector: +# release: kube-prometheus-stack + +extraManifests: + - apiVersion: operator.victoriametrics.com/v1beta1 + kind: VMServiceScrape + metadata: + name: "{{ include \"neon-proxy.fullname\" . }}" + labels: + helm.sh/chart: neon-proxy-{{ .Chart.Version }} + app.kubernetes.io/name: neon-proxy + app.kubernetes.io/instance: "{{ include \"neon-proxy.fullname\" . }}" + app.kubernetes.io/version: "{{ .Chart.AppVersion }}" + app.kubernetes.io/managed-by: Helm + namespace: "{{ .Release.Namespace }}" + spec: + selector: + matchLabels: + app.kubernetes.io/name: "neon-proxy" + endpoints: + - port: http + path: /metrics + interval: 10s + scrapeTimeout: 10s + namespaceSelector: + matchNames: + - "{{ .Release.Namespace }}" diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 918e568e27..89e12360f9 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1,4 +1,4 @@ -name: Test and Deploy +name: Build and Test on: push: @@ -19,10 +19,12 @@ concurrency: env: RUST_BACKTRACE: 1 COPT: '-Werror' + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} jobs: tag: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned outputs: build-tag: ${{steps.build-tag.outputs.tag}} @@ -50,7 +52,7 @@ jobs: id: build-tag check-codestyle-python: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cloud:pinned options: --init @@ -85,7 +87,7 @@ jobs: run: poetry run mypy . 
check-codestyle-rust: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -97,16 +99,16 @@ jobs: submodules: true fetch-depth: 1 - - name: Restore cargo deps cache - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# Disabled for now +# - name: Restore cargo deps cache +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -133,7 +135,7 @@ jobs: run: cargo deny check build-neon: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -141,7 +143,6 @@ jobs: fail-fast: false matrix: build_type: [ debug, release ] - env: BUILD_TYPE: ${{ matrix.build_type }} GIT_VERSION: ${{ github.sha }} @@ -194,24 +195,26 @@ jobs: echo "cov_prefix=${cov_prefix}" >> $GITHUB_ENV echo "CARGO_FEATURES=${CARGO_FEATURES}" >> $GITHUB_ENV echo "CARGO_FLAGS=${CARGO_FLAGS}" >> $GITHUB_ENV + echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV + # Disabled for now # Don't include the ~/.cargo/registry/src directory. It contains just # uncompressed versions of the crates in ~/.cargo/registry/cache # directory, and it's faster to let 'cargo' to rebuild it from the # compressed crates. 
- - name: Cache cargo deps - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - # Fall back to older versions of the key, if no cache for current Cargo.lock was found - key: | - v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- +# - name: Cache cargo deps +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# ~/.cargo/registry/ +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# # Fall back to older versions of the key, if no cache for current Cargo.lock was found +# key: | +# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - name: Cache postgres v14 build id: cache_pg_14 @@ -301,7 +304,7 @@ jobs: uses: ./.github/actions/save-coverage-data regress-tests: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -334,7 +337,7 @@ jobs: uses: ./.github/actions/save-coverage-data benchmarks: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -365,7 +368,7 @@ jobs: # while coverage is currently collected for the debug ones merge-allure-report: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -402,7 +405,7 @@ jobs: DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json coverage-report: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init @@ -418,16 +421,17 @@ jobs: submodules: true fetch-depth: 1 - - name: Restore cargo deps cache - id: cache_cargo - uses: actions/cache@v3 - with: - path: | - ~/.cargo/registry/ - !~/.cargo/registry/src - ~/.cargo/git/ - target/ - key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# Disabled for now +# - name: Restore cargo deps cache +# id: cache_cargo +# uses: actions/cache@v3 +# with: +# path: | +# ~/.cargo/registry/ +# !~/.cargo/registry/src +# ~/.cargo/git/ +# target/ +# key: v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} - name: Get Neon artifact uses: ./.github/actions/download @@ -477,7 +481,7 @@ jobs: }" trigger-e2e-tests: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned options: --init @@ -522,9 +526,10 @@ jobs: }" neon-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + # https://github.com/GoogleContainerTools/kaniko/issues/2005 + container: gcr.io/kaniko-project/executor:v1.7.0-debug defaults: run: shell: sh -eu {0} @@ -540,12 +545,16 @@ jobs: run: echo 
"{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build neon - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} + + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ecr': Permission denied + - name: Cleanup ECR folder + run: rm -rf ~/.ecr compute-tools-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + container: gcr.io/kaniko-project/executor:v1.7.0-debug defaults: run: shell: sh -eu {0} @@ -558,11 +567,14 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute tools - run: /kaniko/executor --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --snapshotMode=redo --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-tools --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr compute-node-image: - runs-on: [ self-hosted, dev, x64 ] - container: gcr.io/kaniko-project/executor:v1.9.0-debug + runs-on: [ self-hosted, gen3, large ] + container: gcr.io/kaniko-project/executor:v1.7.0-debug needs: [ tag ] strategy: fail-fast: false @@ -583,10 +595,13 @@ jobs: run: echo "{\"credsStore\":\"ecr-login\"}" > /kaniko/.docker/config.json - name: Kaniko build compute node with extensions - run: /kaniko/executor --skip-unused-stages --snapshotMode=redo --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . --build-arg GIT_VERSION=${{ github.sha }} --dockerfile Dockerfile.compute-node-${{ matrix.version }} --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + run: /kaniko/executor --reproducible --snapshotMode=redo --skip-unused-stages --cache=true --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache --context . 
--build-arg GIT_VERSION=${{ github.sha }} --build-arg PG_VERSION=${{ matrix.version }} --dockerfile Dockerfile.compute-node --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr vm-compute-node-image: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, large ] needs: [ tag, compute-node-image ] strategy: fail-fast: false @@ -631,7 +646,7 @@ jobs: test-images: needs: [ tag, neon-image, compute-node-image, compute-tools-image ] - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] steps: - name: Checkout @@ -673,20 +688,39 @@ jobs: docker compose -f ./docker-compose/docker-compose.yml down promote-images: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] needs: [ tag, test-images, vm-compute-node-image ] + container: golang:1.19-bullseye if: github.event_name != 'workflow_dispatch' - container: amazon/aws-cli - strategy: - fail-fast: false - matrix: - name: [ neon, compute-node-v14, vm-compute-node-v14, compute-node-v15, vm-compute-node-v15, compute-tools] steps: - - name: Promote image to latest + - name: Install Crane & ECR helper + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' run: | - export MANIFEST=$(aws ecr batch-get-image --repository-name ${{ matrix.name }} --image-ids imageTag=${{needs.tag.outputs.build-tag}} --query 'images[].imageManifest' --output text) - aws ecr put-image --repository-name ${{ matrix.name }} --image-tag latest --image-manifest "$MANIFEST" + go install github.com/google/go-containerregistry/cmd/crane@31786c6cbb82d6ec4fb8eb79cd9387905130534e # v0.11.0 + go install github.com/awslabs/amazon-ecr-credential-helper/ecr-login/cli/docker-credential-ecr-login@69c85dc22db6511932bbf119e1a0cc5c90c69a7f # v0.6.0 + + - name: Configure ECR login + run: | + mkdir /github/home/.docker/ + echo "{\"credsStore\":\"ecr-login\"}" > /github/home/.docker/config.json + + - name: Add latest tag to images + if: | + (github.ref_name == 'main' || github.ref_name == 'release') && + github.event_name != 'workflow_dispatch' + run: | + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-tools:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v14:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-v15:${{needs.tag.outputs.build-tag}} latest + crane tag 369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest + + - name: Cleanup ECR folder + run: rm -rf ~/.ecr push-docker-hub: runs-on: [ self-hosted, dev, x64 ] @@ -776,114 +810,11 @@ jobs: crane tag neondatabase/compute-node-v15:${{needs.tag.outputs.build-tag}} latest crane tag neondatabase/vm-compute-node-v15:${{needs.tag.outputs.build-tag}} latest - calculate-deploy-targets: - runs-on: [ self-hosted, dev, x64 ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - outputs: - matrix-include: ${{ steps.set-matrix.outputs.include }} - steps: - - id: set-matrix - run: | - if [[ "$GITHUB_REF_NAME" == "release" ]]; then - PRODUCTION='{"env_name": 
"production", "proxy_job": "neon-proxy", "proxy_config": "production.proxy", "storage_broker_ns": "neon-storage-broker", "storage_broker_config": "production.neon-storage-broker", "kubeconfig_secret": "PRODUCTION_KUBECONFIG_DATA", "console_api_key_secret": "NEON_PRODUCTION_API_KEY"}' - echo "include=[$PRODUCTION]" >> $GITHUB_OUTPUT - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to 'release'" - exit 1 - fi - - deploy: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - - eval $(ssh-agent) - echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key - echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub - chmod 0600 ssh-key - ssh-add ssh-key - rm -f ssh-key ssh-key-cert.pub - ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater - ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i ${{ matrix.env_name }}.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets[matrix.console_api_key_secret] }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly - needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ eu-west-1, us-east-2 ] - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Redeploy - run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh - elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh - else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" - exit 1 - fi - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version + - name: Cleanup ECR folder + run: rm -rf ~/.ecr deploy-pr-test-new: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. # If it notices a fresh storage it may bump the compute version. And if compute image failed to build it may break things badly @@ -915,311 +846,40 @@ jobs: ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version - deploy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # We need both storage **and** compute images for deploy, because control plane picks the compute version based on the storage version. - # If it notices a fresh storage it may bump the compute version. 
And if compute image failed to build it may break things badly + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] - environment: - name: prod-${{ matrix.target_region }} + if: ( github.ref_name == 'main' || github.ref_name == 'release' ) && github.event_name != 'workflow_dispatch' steps: - name: Checkout uses: actions/checkout@v3 with: - submodules: true + submodules: false fetch-depth: 0 - - name: Redeploy + - name: Trigger deploy workflow + env: + GH_TOKEN: ${{ github.token }} run: | - export DOCKER_TAG=${{needs.tag.outputs.build-tag}} - cd "$(pwd)/.github/ansible" - if [[ "$GITHUB_REF_NAME" == "main" ]]; then - ./get_binaries.sh + gh workflow run deploy-dev.yml --ref main -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then - RELEASE=true ./get_binaries.sh + gh workflow run deploy-prod.yml --ref release -f branch=${{ github.sha }} -f dockerTag=${{needs.tag.outputs.build-tag}} else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" exit 1 fi - ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} - rm -f neon_install.tar.gz .neon_current_version - - deploy-proxy: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Add curl - run: apt update && apt install curl -y - - - name: Store kubeconfig file - run: | - echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Setup helm v3 - run: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Re-deploy proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade ${{ matrix.proxy_job }} neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - helm upgrade ${{ matrix.proxy_job }}-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/${{ matrix.proxy_config }}-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker: - name: deploy storage broker on old staging and old prod - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, calculate-deploy-targets, tag, regress-tests ] - if: | - github.ref_name == 'release' && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: ${{fromJSON(needs.calculate-deploy-targets.outputs.matrix-include)}} - environment: - name: prod-old - env: - KUBECONFIG: .kubeconfig - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Add curl - run: apt update && apt install curl -y - - - name: Store kubeconfig file - run: | - echo "${{ secrets[matrix.kubeconfig_secret] }}" | base64 --decode > ${KUBECONFIG} - chmod 0600 ${KUBECONFIG} - - - name: Setup helm v3 - run: | - curl -s https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - helm repo add neondatabase https://neondatabase.github.io/helm-charts - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace ${{ matrix.storage_broker_ns }} --create-namespace --install --atomic -f .github/helm-values/${{ matrix.storage_broker_config }}.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - deploy-proxy-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - deploy_link_proxy: true - deploy_legacy_scram_proxy: true - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - deploy_link_proxy: false - deploy_legacy_scram_proxy: false - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy scram proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy link proxy - if: matrix.deploy_link_proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - - name: Re-deploy legacy scram proxy - if: matrix.deploy_legacy_scram_proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker-dev-new: - runs-on: [ self-hosted, dev, x64 ] - container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'main') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: dev-us-east-2-beta - - target_region: eu-west-1 - target_cluster: dev-eu-west-1-zeta - environment: - name: dev-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - - deploy-proxy-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. - needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Re-deploy proxy - run: | - DOCKER_TAG=${{needs.tag.outputs.build-tag}} - helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s - - deploy-storage-broker-prod-new: - runs-on: prod - container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest - # Compute image isn't strictly required for proxy deploy, but let's still wait for it to run all deploy jobs consistently. 
- needs: [ push-docker-hub, tag, regress-tests ] - if: | - (github.ref_name == 'release') && - github.event_name != 'workflow_dispatch' - defaults: - run: - shell: bash - strategy: - matrix: - include: - - target_region: us-east-2 - target_cluster: prod-us-east-2-delta - - target_region: us-west-2 - target_cluster: prod-us-west-2-eta - - target_region: eu-central-1 - target_cluster: prod-eu-central-1-gamma - - target_region: ap-southeast-1 - target_cluster: prod-ap-southeast-1-epsilon - environment: - name: prod-${{ matrix.target_region }} - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: true - fetch-depth: 0 - - - name: Configure environment - run: | - helm repo add neondatabase https://neondatabase.github.io/helm-charts - aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} - - - name: Deploy storage-broker - run: - helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ needs.tag.outputs.build-tag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s - promote-compatibility-data: - runs-on: [ self-hosted, dev, x64 ] + runs-on: [ self-hosted, gen3, small ] container: image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned options: --init - needs: [ deploy, deploy-proxy ] + needs: [ push-docker-hub, tag, regress-tests ] if: github.ref_name == 'release' && github.event_name != 'workflow_dispatch' steps: - name: Promote compatibility snapshot for the release diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml new file mode 100644 index 0000000000..409517bf63 --- /dev/null +++ b/.github/workflows/deploy-dev.yml @@ -0,0 +1,179 @@ +name: Neon Deploy dev + +on: + workflow_dispatch: + inputs: + dockerTag: + description: 'Docker tag to deploy' + required: true + type: string + branch: + description: 'Branch or commit used for deploy scripts and configs' + required: true + type: string + default: 'main' + deployStorage: + description: 'Deploy storage' + required: true + type: boolean + default: true + deployProxy: + description: 'Deploy proxy' + required: true + type: boolean + default: true + deployStorageBroker: + description: 'Deploy storage-broker' + required: true + type: boolean + default: true + +env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + +concurrency: + group: deploy-dev + cancel-in-progress: false + +jobs: + deploy-storage-new: + runs-on: [ self-hosted, gen3, small ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + options: --user root --privileged + if: inputs.deployStorage + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ eu-west-1, us-east-2 ] + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + ./get_binaries.sh + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e 
SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy-proxy-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + deploy_link_proxy: true + deploy_legacy_scram_proxy: true + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy scram proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy link proxy + if: matrix.deploy_link_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy legacy scram proxy + if: matrix.deploy_legacy_scram_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache + + deploy-storage-broker-new: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: dev-us-east-2-beta + - target_region: eu-west-1 + target_cluster: dev-eu-west-1-zeta + environment: + name: dev-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1-node16 + with: + role-to-assume: arn:aws:iam::369495373322:role/github-runner + aws-region: eu-central-1 + role-skip-session-tagging: true + role-duration-seconds: 1800 + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ 
matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml new file mode 100644 index 0000000000..e1954b5540 --- /dev/null +++ b/.github/workflows/deploy-prod.yml @@ -0,0 +1,277 @@ +name: Neon Deploy prod + +on: + workflow_dispatch: + inputs: + dockerTag: + description: 'Docker tag to deploy' + required: true + type: string + branch: + description: 'Branch or commit used for deploy scripts and configs' + required: true + type: string + default: 'main' + deployStorage: + description: 'Deploy storage' + required: true + type: boolean + default: true + deployProxy: + description: 'Deploy proxy' + required: true + type: boolean + default: true + deployStorageBroker: + description: 'Deploy storage-broker' + required: true + type: boolean + default: true + +concurrency: + group: deploy-prod + cancel-in-progress: false + +jobs: + deploy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployStorage + defaults: + run: + shell: bash + strategy: + matrix: + target_region: [ us-east-2, us-west-2, eu-central-1, ap-southeast-1 ] + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + ./get_binaries.sh + + ansible-galaxy collection install sivel.toiletwater + ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + deploy-proxy-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployProxy + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + deploy_link_proxy: true + deploy_legacy_scram_proxy: false + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + deploy_link_proxy: false + deploy_legacy_scram_proxy: true + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + deploy_link_proxy: false + deploy_legacy_scram_proxy: false + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Re-deploy scram proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag 
}} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy link proxy + if: matrix.deploy_link_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-link neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-link.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Re-deploy legacy scram proxy + if: matrix.deploy_legacy_scram_proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram-legacy neondatabase/neon-proxy --namespace neon-proxy --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-proxy-scram-legacy.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + deploy-storage-broker-prod-new: + runs-on: prod + container: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + strategy: + matrix: + include: + - target_region: us-east-2 + target_cluster: prod-us-east-2-delta + - target_region: us-west-2 + target_cluster: prod-us-west-2-eta + - target_region: eu-central-1 + target_cluster: prod-eu-central-1-gamma + - target_region: ap-southeast-1 + target_cluster: prod-ap-southeast-1-epsilon + environment: + name: prod-${{ matrix.target_region }} + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Configure environment + run: | + helm repo add neondatabase https://neondatabase.github.io/helm-charts + aws --region ${{ matrix.target_region }} eks update-kubeconfig --name ${{ matrix.target_cluster }} + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + # Deploy to old account below + + deploy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorage + defaults: + run: + shell: bash + environment: + name: prod-old + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Redeploy + run: | + export DOCKER_TAG=${{ inputs.dockerTag }} + cd "$(pwd)/.github/ansible" + + if [[ "$GITHUB_REF_NAME" == "main" ]]; then + ./get_binaries.sh + elif [[ "$GITHUB_REF_NAME" == "release" ]]; then + RELEASE=true ./get_binaries.sh + else + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + exit 1 + fi + + eval $(ssh-agent) + echo "${{ secrets.TELEPORT_SSH_KEY }}" | tr -d '\n'| base64 --decode >ssh-key + echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub + chmod 0600 ssh-key + ssh-add ssh-key + rm -f ssh-key ssh-key-cert.pub + ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater + 
ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + rm -f neon_install.tar.gz .neon_current_version + + # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied + - name: Cleanup ansible folder + run: rm -rf ~/.ansible + + deploy-proxy: + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployProxy + defaults: + run: + shell: bash + environment: + name: prod-old + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Store kubeconfig file + run: | + echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Add neon helm chart + run: helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Re-deploy proxy + run: | + DOCKER_TAG=${{ inputs.dockerTag }} + helm upgrade neon-proxy-scram neondatabase/neon-proxy --namespace neon-proxy --install --atomic -f .github/helm-values/production.proxy-scram.yaml --set image.tag=${DOCKER_TAG} --set settings.sentryUrl=${{ secrets.SENTRY_URL_PROXY }} --wait --timeout 15m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache + + deploy-storage-broker: + name: deploy storage broker on old staging and old prod + runs-on: [ self-hosted, gen3, small ] + container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned + if: inputs.deployStorageBroker + defaults: + run: + shell: bash + environment: + name: prod-old + env: + KUBECONFIG: .kubeconfig + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + fetch-depth: 0 + ref: ${{ inputs.branch }} + + - name: Store kubeconfig file + run: | + echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG} + chmod 0600 ${KUBECONFIG} + + - name: Add neon helm chart + run: helm repo add neondatabase https://neondatabase.github.io/helm-charts + + - name: Deploy storage-broker + run: + helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s + + - name: Cleanup helm folder + run: rm -rf ~/.cache diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000000..49e04ee001 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,33 @@ +name: Create Release Branch + +on: + schedule: + - cron: '0 10 * * 2' + +jobs: + create_release_branch: + runs-on: [ubuntu-latest] + + steps: + - name: Check out code + uses: actions/checkout@v3 + with: + ref: main + + - name: Get current date + id: date + run: echo "date=(date +'%Y-%m-%d')" >> $GITHUB_OUTPUT + + - name: Create release branch + run: git checkout -b release/${{ steps.date.outputs.date }} + + - name: Push new branch + run: git push origin release/${{ steps.date.outputs.date }} + + - name: Create pull request into release + uses: thomaseizinger/create-pull-request@e3972219c86a56550fb70708d96800d8e24ba862 # 1.3.0 + with: + GITHUB_TOKEN: ${{ 
secrets.GITHUB_TOKEN }} + head: release/${{ steps.date.outputs.date }} + base: release + title: Release ${{ steps.date.outputs.date }} diff --git a/Cargo.lock b/Cargo.lock index d8aba9ba68..2985a654f3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -37,11 +37,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "amplify_num" -version = "0.4.1" -source = "git+https://github.com/rust-amplify/rust-amplify.git?tag=v4.0.0-beta.1#3ad006cf2804e1862ec7725a7684a493f3023523" - [[package]] name = "android_system_properties" version = "0.1.5" @@ -66,6 +61,15 @@ dependencies = [ "backtrace", ] +[[package]] +name = "archery" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a8da9bc4c4053ee067669762bcaeea6e241841295a2b6c948312dad6ef4cc02" +dependencies = [ + "static_assertions", +] + [[package]] name = "asn1-rs" version = "0.5.1" @@ -137,15 +141,6 @@ dependencies = [ "syn", ] -[[package]] -name = "atomic-polyfill" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ff7eb3f316534d83a8a2c3d1674ace8a5a71198eba31e2e2b597833f699b28" -dependencies = [ - "critical-section", -] - [[package]] name = "atty" version = "0.2.14" @@ -629,9 +624,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.11.1" +version = "3.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" +checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535" [[package]] name = "byteorder" @@ -750,13 +745,13 @@ dependencies = [ [[package]] name = "clap" -version = "4.0.32" +version = "4.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7db700bc935f9e43e88d00b0850dae18a63773cfbec6d8e070fccf7fef89a39" +checksum = "4ec7a4128863c188deefe750ac1d1dfe66c236909f845af04beed823638dc1b2" dependencies = [ "bitflags", "clap_derive", - "clap_lex 0.3.0", + "clap_lex 0.3.1", "is-terminal", "once_cell", "strsim", @@ -765,9 +760,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.0.21" +version = "4.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0177313f9f02afc995627906bbd8967e2be069f5261954222dac78290c2b9014" +checksum = "684a277d672e91966334af371f1a7b5833f9aa00b07c84e92fbce95e00208ce8" dependencies = [ "heck", "proc-macro-error", @@ -787,9 +782,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d4198f73e42b4936b35b5bb248d81d2b595ecb170da0bac7655c54eedfa8da8" +checksum = "783fe232adfca04f90f56201b26d79682d4cd2625e0bc7290b95123afe558ade" dependencies = [ "os_str_bytes", ] @@ -832,10 +827,11 @@ version = "0.1.0" dependencies = [ "anyhow", "chrono", - "clap 4.0.32", + "clap 4.1.1", "futures", "hyper", "notify", + "opentelemetry", "postgres", "regex", "serde", @@ -844,7 +840,9 @@ dependencies = [ "tokio", "tokio-postgres", "tracing", + "tracing-opentelemetry", "tracing-subscriber", + "tracing-utils", "url", "workspace_hack", ] @@ -887,7 +885,7 @@ name = "control_plane" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.32", + "clap 4.1.1", "comfy-table", "git-version", "nix", @@ -988,12 +986,6 @@ dependencies = [ "itertools", ] -[[package]] -name = "critical-section" -version = "1.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6548a0ad5d2549e111e1f6a11a6c2e2d00ce6a3dafe22948d67c2b443f775e52" - [[package]] 
name = "crossbeam-channel" version = "0.5.6" @@ -1030,12 +1022,11 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.11" +version = "0.8.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51887d4adc7b564537b15adcfb307936f8075dfcd5f00dde9a9f1d29383682bc" +checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" dependencies = [ "cfg-if", - "once_cell", ] [[package]] @@ -1152,6 +1143,19 @@ dependencies = [ "syn", ] +[[package]] +name = "dashmap" +version = "5.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" +dependencies = [ + "cfg-if", + "hashbrown 0.12.3", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "data-encoding" version = "2.3.3" @@ -1506,15 +1510,6 @@ version = "1.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" -[[package]] -name = "hash32" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0c35f58762feb77d74ebe43bdbc3210f09be9fe6742234d573bacc26ed92b67" -dependencies = [ - "byteorder", -] - [[package]] name = "hashbrown" version = "0.12.3" @@ -1530,19 +1525,6 @@ dependencies = [ "ahash", ] -[[package]] -name = "heapless" -version = "0.7.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "db04bc24a18b9ea980628ecf00e6c0264f3c1426dac36c00cb49b6fbad8b0743" -dependencies = [ - "atomic-polyfill", - "hash32", - "rustc_version", - "spin 0.9.4", - "stable_deref_trait", -] - [[package]] name = "heck" version = "0.4.0" @@ -1804,9 +1786,9 @@ dependencies = [ [[package]] name = "io-lifetimes" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46112a93252b123d31a119a8d1a1ac19deac4fac6e0e8b0df58f0d4e5870e63c" +checksum = "e7d6c6f8c91b4b9ed43484ad1a938e393caf35960fce7f82a040497207bd8e9e" dependencies = [ "libc", "windows-sys", @@ -1916,12 +1898,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "libm" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" - [[package]] name = "link-cplusplus" version = "1.0.8" @@ -2067,9 +2043,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" [[package]] name = "nix" -version = "0.26.1" +version = "0.26.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46a58d1d356c6597d08cde02c2f09d785b09e28711837b1ed667dc652c08a694" +checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" dependencies = [ "bitflags", "cfg-if", @@ -2081,9 +2057,9 @@ dependencies = [ [[package]] name = "nom" -version = "7.1.2" +version = "7.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5507769c4919c998e69e49c839d9dc6e693ede4cc4290d6ad8b41d4f09c548c" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" dependencies = [ "memchr", "minimal-lexical", @@ -2154,7 +2130,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", - "libm", ] [[package]] @@ -2203,6 +2178,108 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69d6c3d7288a106c0a363e4b0e8d308058d56902adefb16f4936f417ffef086e" +dependencies = [ + "opentelemetry_api", + "opentelemetry_sdk", +] + +[[package]] +name = "opentelemetry-http" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc79add46364183ece1a4542592ca593e6421c60807232f5b8f7a31703825d" +dependencies = [ + "async-trait", + "bytes", + "http", + "opentelemetry_api", + "reqwest", +] + +[[package]] +name = "opentelemetry-otlp" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1c928609d087790fc936a1067bdc310ae702bdf3b090c3f281b713622c8bbde" +dependencies = [ + "async-trait", + "futures", + "futures-util", + "http", + "opentelemetry", + "opentelemetry-http", + "opentelemetry-proto", + "prost", + "reqwest", + "thiserror", +] + +[[package]] +name = "opentelemetry-proto" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d61a2f56df5574508dd86aaca016c917489e589ece4141df1b5e349af8d66c28" +dependencies = [ + "futures", + "futures-util", + "opentelemetry", + "prost", + "tonic", + "tonic-build", +] + +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b02e0230abb0ab6636d18e2ba8fa02903ea63772281340ccac18e0af3ec9eeb" +dependencies = [ + "opentelemetry", +] + +[[package]] +name = "opentelemetry_api" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c24f96e21e7acc813c7a8394ee94978929db2bcc46cf6b5014fc612bf7760c22" +dependencies = [ + "fnv", + "futures-channel", + "futures-util", + "indexmap", + "js-sys", + "once_cell", + "pin-project-lite", + "thiserror", +] + +[[package]] +name = "opentelemetry_sdk" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ca41c4933371b61c2a2f214bf16931499af4ec90543604ec828f7a625c09113" +dependencies = [ + "async-trait", + "crossbeam-channel", + "dashmap", + "fnv", + "futures-channel", + "futures-executor", + "futures-util", + "once_cell", + "opentelemetry_api", + "percent-encoding", + "rand", + "thiserror", + "tokio", + "tokio-stream", +] + [[package]] name = "os_info" version = "3.5.1" @@ -2230,14 +2307,13 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" name = "pageserver" version = "0.1.0" dependencies = [ - "amplify_num", "anyhow", "async-stream", "async-trait", "byteorder", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "close_fds", "const_format", "consumption_metrics", @@ -2269,7 +2345,7 @@ dependencies = [ "regex", "remote_storage", "reqwest", - "rstar", + "rpds", "scopeguard", "serde", "serde_json", @@ -2581,9 +2657,9 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.49" +version = "1.0.50" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57a8eca9f9c4ffde41714334dee777596264c7825420f521abc92b5b5deb63a5" +checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" dependencies = [ "unicode-ident", ] @@ -2683,7 +2759,7 @@ dependencies = [ "bstr", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "consumption_metrics", "futures", "git-version", @@ -2742,14 
+2818,13 @@ dependencies = [ [[package]] name = "rand" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ "libc", "rand_chacha", "rand_core", - "rand_hc", ] [[package]] @@ -2771,15 +2846,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "rand_hc" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" -dependencies = [ - "rand_core", -] - [[package]] name = "rayon" version = "1.6.1" @@ -2930,7 +2996,7 @@ dependencies = [ "cc", "libc", "once_cell", - "spin 0.5.2", + "spin", "untrusted", "web-sys", "winapi", @@ -2950,14 +3016,12 @@ dependencies = [ ] [[package]] -name = "rstar" -version = "0.9.3" +name = "rpds" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b40f1bfe5acdab44bc63e6699c28b74f75ec43afb59f3eda01e145aff86a25fa" +checksum = "66262ea963eff99163e6b741fbc3417a52cc13074728c1047e9911789df9b000" dependencies = [ - "heapless", - "num-traits", - "smallvec", + "archery", ] [[package]] @@ -3018,9 +3082,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.36.6" +version = "0.36.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4feacf7db682c6c329c4ede12649cd36ecab0f3be5b7d74e6a20304725db4549" +checksum = "d4fdebc4b395b7fbb9ab11e462e20ed9051e7b16e42d24042c776eca0ac81b03" dependencies = [ "bitflags", "errno", @@ -3093,7 +3157,7 @@ dependencies = [ "async-trait", "byteorder", "bytes", - "clap 4.0.32", + "clap 4.1.1", "const_format", "crc32c", "fs2", @@ -3479,21 +3543,6 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" -[[package]] -name = "spin" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" -dependencies = [ - "lock_api", -] - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - [[package]] name = "static_assertions" version = "1.1.0" @@ -3507,7 +3556,7 @@ dependencies = [ "anyhow", "async-stream", "bytes", - "clap 4.0.32", + "clap 4.1.1", "const_format", "futures", "futures-core", @@ -3639,9 +3688,9 @@ dependencies = [ [[package]] name = "termcolor" -version = "1.1.3" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bab24d30b911b2376f3a13cc2cd443142f0c81dda04c118693e35b3835757755" +checksum = "be55cf8942feac5c765c2c993422806843c9a9a45d4d5c407ad6dd2ea95eb9b6" dependencies = [ "winapi-util", ] @@ -3749,9 +3798,9 @@ dependencies = [ [[package]] name = "tokio" -version = "1.24.1" +version = "1.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d9f76183f91ecfb55e1d7d5602bd1d979e38a3a522fe900241cf195624d67ae" +checksum = "597a12a59981d9e3c38d216785b0c37399f6e415e8d0712047620f189371b0bb" dependencies = [ "autocfg", "bytes", @@ -4071,6 +4120,20 @@ dependencies = [ "tracing-core", ] +[[package]] +name = "tracing-opentelemetry" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "21ebb87a95ea13271332df069020513ab70bdb5637ca42d6e492dc3bbbad48de" +dependencies = [ + "once_cell", + "opentelemetry", + "tracing", + "tracing-core", + "tracing-log", + "tracing-subscriber", +] + [[package]] name = "tracing-serde" version = "0.1.3" @@ -4102,6 +4165,22 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tracing-utils" +version = "0.1.0" +dependencies = [ + "hyper", + "opentelemetry", + "opentelemetry-otlp", + "opentelemetry-semantic-conventions", + "reqwest", + "tokio", + "tracing", + "tracing-opentelemetry", + "tracing-subscriber", + "workspace_hack", +] + [[package]] name = "try-lock" version = "0.2.4" @@ -4183,9 +4262,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "ureq" -version = "2.6.1" +version = "2.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "733b5ad78377302af52c0dbcb2623d78fe50e4b3bf215948ff29e9ee031d8566" +checksum = "338b31dd1314f68f3aabf3ed57ab922df95ffcd902476ca7ba3c4ce7b908c46d" dependencies = [ "base64 0.13.1", "log", @@ -4226,6 +4305,7 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "atty", "bincode", "byteorder", "bytes", @@ -4287,7 +4367,7 @@ name = "wal_craft" version = "0.1.0" dependencies = [ "anyhow", - "clap 4.0.32", + "clap 4.1.1", "env_logger", "log", "once_cell", @@ -4534,11 +4614,13 @@ dependencies = [ "anyhow", "bytes", "chrono", - "clap 4.0.32", + "clap 4.1.1", "crossbeam-utils", "either", "fail", + "futures", "futures-channel", + "futures-executor", "futures-task", "futures-util", "indexmap", @@ -4554,6 +4636,9 @@ dependencies = [ "rand", "regex", "regex-syntax", + "reqwest", + "ring", + "rustls", "scopeguard", "serde", "serde_json", @@ -4561,6 +4646,7 @@ dependencies = [ "syn", "tokio", "tokio-util", + "tonic", "tower", "tracing", "tracing-core", diff --git a/Cargo.toml b/Cargo.toml index 74cc16d690..e6695c4246 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -61,6 +61,10 @@ nix = "0.26" notify = "5.0.0" num-traits = "0.2.15" once_cell = "1.13" +opentelemetry = "0.18.0" +opentelemetry-otlp = { version = "0.11.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.10.0" +tracing-opentelemetry = "0.18.0" parking_lot = "0.12" pin-project-lite = "0.2" prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency @@ -69,7 +73,7 @@ rand = "0.8" regex = "1.4" reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] } routerify = "3" -rstar = "0.9.3" +rpds = "0.12.0" rustls = "0.20" rustls-pemfile = "1" rustls-split = "0.3" @@ -107,9 +111,6 @@ x509-parser = "0.14" env_logger = "0.10" log = "0.4" -## TODO switch when the new release is made -amplify_num = { git = "https://github.com/rust-amplify/rust-amplify.git", tag = "v4.0.0-beta.1" } - ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="43e6db254a97fdecbce33d8bc0890accfd74495e" } @@ -128,6 +129,7 @@ remote_storage = { version = "0.1", path = "./libs/remote_storage/" } safekeeper_api = { version = "0.1", path = "./libs/safekeeper_api" } storage_broker = { version = "0.1", path = "./storage_broker/" } # Note: main broker code is inside the binary crate, so linking with the library shouldn't be 
heavy. tenant_size_model = { version = "0.1", path = "./libs/tenant_size_model/" } +tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = "./libs/utils/" } ## Common library dependency diff --git a/Dockerfile.compute-node-v14 b/Dockerfile.compute-node similarity index 86% rename from Dockerfile.compute-node-v14 rename to Dockerfile.compute-node index 2deb95a93f..936f368833 100644 --- a/Dockerfile.compute-node-v14 +++ b/Dockerfile.compute-node @@ -1,8 +1,5 @@ -# -# This file is identical to the Dockerfile.compute-node-v15 file -# except for the version of Postgres that is built. -# - +ARG REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com +ARG IMAGE=rust ARG TAG=pinned ######################################################################################### @@ -22,7 +19,8 @@ RUN apt update && \ # ######################################################################################### FROM build-deps AS pg-build -COPY vendor/postgres-v14 postgres +ARG PG_VERSION +COPY vendor/postgres-${PG_VERSION} postgres RUN cd postgres && \ ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ @@ -135,6 +133,27 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3 echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control +######################################################################################### +# +# Layer "unit-pg-build" +# compile unit extension +# +######################################################################################### +FROM build-deps AS unit-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \ + tar xvzf 7.7.tar.gz && \ + cd postgresql-unit-7.7 && \ + make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ + # unit extension's "create extension" script relies on absolute install path to fill some reference tables. + # We move the extension from '/usr/local/pgsql/' to '/usr/local/' after it is build. So we need to adjust the path. + # This one-liner removes pgsql/ part of the path. + # NOTE: Other extensions that rely on MODULEDIR variable after building phase will need the same fix. 
+ find /usr/local/pgsql/share/extension/ -name "unit*.sql" -print0 | xargs -0 sed -i "s|pgsql/||g" && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/unit.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -146,6 +165,7 @@ COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=h3-pg-build /h3/usr / +COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ @@ -158,7 +178,7 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \ # Compile and run the Neon-specific `compute_ctl` binary # ######################################################################################### -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools +FROM $REPOSITORY/$IMAGE:$TAG AS compute-tools USER nonroot # Copy entire project to get Cargo.* files with proper dependencies for the whole project COPY --chown=nonroot . . diff --git a/Dockerfile.compute-node-v15 b/Dockerfile.compute-node-v15 deleted file mode 100644 index 8647ce2bf4..0000000000 --- a/Dockerfile.compute-node-v15 +++ /dev/null @@ -1,220 +0,0 @@ -# -# This file is identical to the Dockerfile.compute-node-v14 file -# except for the version of Postgres that is built. -# - -ARG TAG=pinned - -######################################################################################### -# -# Layer "build-deps" -# -######################################################################################### -FROM debian:bullseye-slim AS build-deps -RUN apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ - zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev - -######################################################################################### -# -# Layer "pg-build" -# Build Postgres from the neon postgres repository. -# -######################################################################################### -FROM build-deps AS pg-build -COPY vendor/postgres-v15 postgres -RUN cd postgres && \ - ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \ - # Install headers - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/include install && \ - make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \ - # Enable some of contrib extensions - echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control - -######################################################################################### -# -# Layer "postgis-build" -# Build PostGIS from the upstream PostGIS mirror. 
-# -######################################################################################### -FROM build-deps AS postgis-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install -y gdal-bin libgdal-dev libprotobuf-c-dev protobuf-c-compiler xsltproc - -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \ - tar xvzf postgis-3.3.1.tar.gz && \ - cd postgis-3.3.1 && \ - ./autogen.sh && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - ./configure && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - cd extensions/postgis && \ - make clean && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_raster.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control - -######################################################################################### -# -# Layer "plv8-build" -# Build plv8 -# -######################################################################################### -FROM build-deps AS plv8-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN apt update && \ - apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils - -# https://github.com/plv8/plv8/issues/475: -# v8 uses gold for linking and sets `--thread-count=4` which breaks -# gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607) -# Install newer gold version manually as debian-testing binutils version updates -# libc version, which in turn breaks other extension built against non-testing libc. 
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \ - tar xvzf binutils-2.38.tar.gz && \ - cd binutils-2.38 && \ - cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \ - cd ../bfd && ./configure && make bfdver.h && \ - cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \ - cp /usr/local/bin/ld.gold /usr/bin/gold - -# Sed is used to patch for https://github.com/plv8/plv8/issues/503 -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \ - tar xvzf v3.1.4.tar.gz && \ - cd plv8-3.1.4 && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \ - make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ - rm -rf /plv8-* && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control - -######################################################################################### -# -# Layer "h3-pg-build" -# Build h3_pg -# -######################################################################################### -FROM build-deps AS h3-pg-build -COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ - -# packaged cmake is too old -RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2-linux-x86_64.sh \ - -q -O /tmp/cmake-install.sh \ - && chmod u+x /tmp/cmake-install.sh \ - && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \ - && rm /tmp/cmake-install.sh - -RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \ - tar xvzf h3.tgz && \ - cd h3-4.0.1 && \ - mkdir build && \ - cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - DESTDIR=/h3 make install && \ - cp -R /h3/usr / && \ - rm -rf build - -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \ - tar xvzf h3-pg.tgz && \ - cd h3-pg-4.0.1 && \ - export PATH="/usr/local/pgsql/bin:$PATH" && \ - make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/h3_postgis.control - -######################################################################################### -# -# Layer "neon-pg-ext-build" -# compile neon extensions -# -######################################################################################### -FROM build-deps AS neon-pg-ext-build -COPY --from=postgis-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=plv8-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/ -COPY --from=h3-pg-build /h3/usr / -COPY pgxn/ pgxn/ - -RUN make -j $(getconf _NPROCESSORS_ONLN) \ - PG_CONFIG=/usr/local/pgsql/bin/pg_config \ - -C pgxn/neon \ - -s install - -######################################################################################### -# -# Compile and run the Neon-specific `compute_ctl` binary -# -######################################################################################### -FROM 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:$TAG AS compute-tools -USER nonroot -# Copy entire project to get Cargo.* files with proper dependencies for the whole project -COPY --chown=nonroot . . 
-RUN cd compute_tools && cargo build --locked --profile release-line-debug-size-lto - -######################################################################################### -# -# Clean up postgres folder before inclusion -# -######################################################################################### -FROM neon-pg-ext-build AS postgres-cleanup-layer -COPY --from=neon-pg-ext-build /usr/local/pgsql /usr/local/pgsql - -# Remove binaries from /bin/ that we won't use (or would manually copy & install otherwise) -RUN cd /usr/local/pgsql/bin && rm ecpg raster2pgsql shp2pgsql pgtopo_export pgtopo_import pgsql2shp - -# Remove headers that we won't need anymore - we've completed installation of all extensions -RUN rm -r /usr/local/pgsql/include - -# Remove static postgresql libraries - all compilation is finished, so we -# can now remove these files - they must be included in other binaries by now -# if they were to be used by other libraries. -RUN rm /usr/local/pgsql/lib/lib*.a - -######################################################################################### -# -# Final layer -# Put it all together into the final image -# -######################################################################################### -FROM debian:bullseye-slim -# Add user postgres -RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \ - echo "postgres:test_console_pass" | chpasswd && \ - mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \ - chown -R postgres:postgres /var/db/postgres && \ - chmod 0750 /var/db/postgres/compute && \ - echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig - -COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local -COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl - -# Install: -# libreadline8 for psql -# libossp-uuid16 for extension ossp-uuid -# libgeos, libgdal, libproj and libprotobuf-c1 for PostGIS -RUN apt update && \ - apt install --no-install-recommends -y \ - libreadline8 \ - libossp-uuid16 \ - libgeos-c1v5 \ - libgdal28 \ - libproj19 \ - libprotobuf-c1 \ - gdb && \ - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -USER postgres -ENTRYPOINT ["/usr/local/bin/compute_ctl"] diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 4536604bdf..f8c3481f57 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,6 +11,7 @@ clap.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } notify.workspace = true +opentelemetry.workspace = true postgres.workspace = true regex.workspace = true serde.workspace = true @@ -19,7 +20,9 @@ tar.workspace = true tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } tokio-postgres.workspace = true tracing.workspace = true +tracing-opentelemetry.workspace = true tracing-subscriber.workspace = true +tracing-utils.workspace = true url.workspace = true workspace_hack.workspace = true diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index e5ab8eb153..2c42662020 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -53,7 +53,7 @@ use compute_tools::spec::*; use url::Url; fn main() -> Result<()> { - init_logger(DEFAULT_LOG_LEVEL)?; + init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; let matches = cli().get_matches(); @@ -84,6 +84,29 @@ fn main() -> Result<()> { } }; + // Extract OpenTelemetry context for the startup actions from the spec, and 
+ // attach it to the current tracing context. + // + // This is used to propagate the context for the 'start_compute' operation + // from the neon control plane. This allows linking together the wider + // 'start_compute' operation that creates the compute container, with the + // startup actions here within the container. + // + // Switch to the startup context here, and exit it once the startup has + // completed and Postgres is up and running. + // + // NOTE: This is supposed to only cover the *startup* actions. Once + // postgres is configured and up-and-running, we exit this span. Any other + // actions that are performed on incoming HTTP requests, for example, are + // performed in separate spans. + let startup_context_guard = if let Some(ref carrier) = spec.startup_tracing_context { + use opentelemetry::propagation::TextMapPropagator; + use opentelemetry::sdk::propagation::TraceContextPropagator; + Some(TraceContextPropagator::new().extract(carrier).attach()) + } else { + None + }; + let pageserver_connstr = spec .cluster .settings @@ -140,6 +163,9 @@ fn main() -> Result<()> { // Wait for the child Postgres process forever. In this state Ctrl+C will // propagate to Postgres and it will be shut down as well. if let Some(mut pg) = pg { + // Startup is finished, exit the startup tracing span + drop(startup_context_guard); + let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); @@ -159,6 +185,10 @@ fn main() -> Result<()> { info!("shutting down"); } + // Shutdown trace pipeline gracefully, so that it has a chance to send any + // pending traces before we exit. + tracing_utils::shutdown_tracing(); + exit(exit_code.unwrap_or(1)) } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index f2a49f332c..589a8e1434 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -3,16 +3,21 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::ComputeNode; use anyhow::Result; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; use serde_json; use tracing::{error, info}; - -use crate::compute::ComputeNode; +use tracing_utils::http::OtelName; // Service function to handle all available routes. -async fn routes(req: Request, compute: Arc) -> Response { +async fn routes(req: Request, compute: &Arc) -> Response { + // + // NOTE: The URI path is currently included in traces. That's OK because + // it doesn't contain any variable parts or sensitive information. But + // please keep that in mind if you change the routing here. + // match (req.method(), req.uri().path()) { // Serialized compute state. (&Method::GET, "/status") => { @@ -30,7 +35,7 @@ async fn routes(req: Request, compute: Arc) -> Response (&Method::POST, "/check_writability") => { info!("serving /check_writability POST request"); - let res = crate::checker::check_writability(&compute).await; + let res = crate::checker::check_writability(compute).await; match res { Ok(_) => Response::new(Body::from("true")), Err(e) => Response::new(Body::from(e.to_string())), @@ -56,7 +61,19 @@ async fn serve(state: Arc) { async move { Ok::<_, Infallible>(service_fn(move |req: Request| { let state = state.clone(); - async move { Ok::<_, Infallible>(routes(req, state).await) } + async move { + Ok::<_, Infallible>( + // NOTE: We include the URI path in the string. It + // doesn't contain any variable parts or sensitive + // information in this API. 
+ tracing_utils::http::tracing_handler( + req, + |req| routes(req, &state), + OtelName::UriPath, + ) + .await, + ) + } })) } }); diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs index 57e5496e86..1b5cf647b0 100644 --- a/compute_tools/src/logger.rs +++ b/compute_tools/src/logger.rs @@ -1,21 +1,37 @@ -use anyhow::Result; +use tracing_opentelemetry::OpenTelemetryLayer; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::prelude::*; -/// Initialize `env_logger` using either `default_level` or +/// Initialize logging to stderr, and OpenTelemetry tracing and exporter. +/// +/// Logging is configured using either `default_log_level` or /// `RUST_LOG` environment variable as default log level. -pub fn init_logger(default_level: &str) -> Result<()> { +/// +/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up +/// configuration from environment variables. For example, to change the destination, +/// set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`. See +/// `tracing-utils` package description. +/// +pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> { + // Initialize Logging let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_level)); + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new(default_log_level)); let fmt_layer = tracing_subscriber::fmt::layer() .with_target(false) .with_writer(std::io::stderr); + // Initialize OpenTelemetry + let otlp_layer = + tracing_utils::init_tracing_without_runtime("compute_ctl").map(OpenTelemetryLayer::new); + + // Put it all together tracing_subscriber::registry() .with(env_filter) + .with(otlp_layer) .with(fmt_layer) .init(); + tracing::info!("logging and tracing started"); Ok(()) } diff --git a/compute_tools/src/params.rs b/compute_tools/src/params.rs index 925a2f8ef3..0ce01ff478 100644 --- a/compute_tools/src/params.rs +++ b/compute_tools/src/params.rs @@ -1,3 +1,9 @@ pub const DEFAULT_LOG_LEVEL: &str = "info"; -pub const DEFAULT_CONNSTRING: &str = "host=localhost user=postgres"; +// From Postgres docs: +// To ease transition from the md5 method to the newer SCRAM method, if md5 is specified +// as a method in pg_hba.conf but the user's password on the server is encrypted for SCRAM +// (see below), then SCRAM-based authentication will automatically be chosen instead. +// https://www.postgresql.org/docs/15/auth-password.html +// +// So it's safe to set md5 here, as `control-plane` anyway uses SCRAM for all roles. pub const PG_HBA_ALL_MD5: &str = "host\tall\t\tall\t\t0.0.0.0/0\t\tmd5"; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 97cd623052..bbd0ec21ed 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -1,3 +1,4 @@ +use std::collections::HashMap; use std::path::Path; use std::str::FromStr; @@ -22,6 +23,8 @@ pub struct ComputeSpec { /// Expected cluster state at the end of transition process. pub cluster: Cluster, pub delta_operations: Option>, + + pub startup_tracing_context: Option>, } /// Cluster state seen from the perspective of the external tools @@ -152,8 +155,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { { RoleAction::Update } else if let Some(pg_pwd) = &r.encrypted_password { - // Check whether password changed or not (trim 'md5:' prefix first) - if pg_pwd[3..] 
!= *role.encrypted_password.as_ref().unwrap() { + // Check whether password changed or not (trim 'md5' prefix first if any) + // + // This is a backward compatibility hack, which comes from the times when we were using + // md5 for everyone and hashes were stored in the console db without md5 prefix. So when + // role comes from the control-plane (json spec) `Role.encrypted_password` doesn't have md5 prefix, + // but when role comes from Postgres (`get_existing_roles` / `existing_roles`) it has this prefix. + // Here is the only place so far where we compare hashes, so it seems to be the best candidate + // to place this compatibility layer. + let pg_pwd = if let Some(stripped) = pg_pwd.strip_prefix("md5") { + stripped + } else { + pg_pwd + }; + if pg_pwd != *role.encrypted_password.as_ref().unwrap() { RoleAction::Update } else { RoleAction::None @@ -372,13 +387,13 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> { name.pg_quote(), db.owner.pg_quote() ); - let _ = info_span!("executing", query).entered(); + let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; } DatabaseAction::Create => { let mut query: String = format!("CREATE DATABASE {} ", name.pg_quote()); query.push_str(&db.to_pg_options()); - let _ = info_span!("executing", query).entered(); + let _guard = info_span!("executing", query).entered(); client.execute(query.as_str(), &[])?; } }; diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 880ab0e83c..07d220195b 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -8,6 +8,7 @@ pub use prometheus::opts; pub use prometheus::register; pub use prometheus::{core, default_registry, proto}; pub use prometheus::{exponential_buckets, linear_buckets}; +pub use prometheus::{register_counter_vec, Counter, CounterVec}; pub use prometheus::{register_gauge, Gauge}; pub use prometheus::{register_gauge_vec, GaugeVec}; pub use prometheus::{register_histogram, Histogram}; diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index b5027cb331..0d7aa2db55 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -29,6 +29,14 @@ pub enum TenantState { Broken, } +pub mod state { + pub const LOADING: &str = "loading"; + pub const ATTACHING: &str = "attaching"; + pub const ACTIVE: &str = "active"; + pub const STOPPING: &str = "stopping"; + pub const BROKEN: &str = "broken"; +} + impl TenantState { pub fn has_in_progress_downloads(&self) -> bool { match self { @@ -39,23 +47,32 @@ impl TenantState { Self::Broken => false, } } + + pub fn as_str(&self) -> &'static str { + match self { + TenantState::Loading => state::LOADING, + TenantState::Attaching => state::ATTACHING, + TenantState::Active => state::ACTIVE, + TenantState::Stopping => state::STOPPING, + TenantState::Broken => state::BROKEN, + } + } } /// A state of a timeline in pageserver's memory. #[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] pub enum TimelineState { - /// Timeline is fully operational. If the containing Tenant is Active, the timeline's - /// background jobs are running otherwise they will be launched when the tenant is activated. + /// The timeline is recognized by the pageserver but is not yet operational. + /// In particular, the walreceiver connection loop is not running for this timeline. + /// It will eventually transition to state Active or Broken. + Loading, + /// The timeline is fully operational. 
+ /// It can be queried, and the walreceiver connection loop is running. Active, - /// A timeline is recognized by pageserver, but not yet ready to operate. - /// The status indicates, that the timeline could eventually go back to Active automatically: - /// for example, if the owning tenant goes back to Active again. - Suspended, - /// A timeline is recognized by pageserver, but not yet ready to operate and not allowed to - /// automatically become Active after certain events: only a management call can change this status. + /// The timeline was previously Loading or Active but is shutting down. + /// It cannot transition back into any other state. Stopping, - /// A timeline is recognized by the pageserver, but can no longer be used for - /// any operations, because it failed to be activated. + /// The timeline is broken and not operational (previous states: Loading or Active). Broken, } diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml new file mode 100644 index 0000000000..8c3d3f9063 --- /dev/null +++ b/libs/tracing-utils/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "tracing-utils" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[dependencies] +hyper.workspace = true +opentelemetry = { workspace = true, features=["rt-tokio"] } +opentelemetry-otlp = { workspace = true, default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions.workspace = true +reqwest = { workspace = true, default-features = false, features = ["rustls-tls"] } +tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } +tracing.workspace = true +tracing-opentelemetry.workspace = true +tracing-subscriber.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs new file mode 100644 index 0000000000..3f80f49de1 --- /dev/null +++ b/libs/tracing-utils/src/http.rs @@ -0,0 +1,96 @@ +//! Tracing wrapper for Hyper HTTP server + +use hyper::HeaderMap; +use hyper::{Body, Request, Response}; +use std::future::Future; +use tracing::Instrument; +use tracing_opentelemetry::OpenTelemetrySpanExt; + +/// Configuration option for what to use as the "otel.name" field in the traces. +pub enum OtelName<'a> { + /// Use a constant string + Constant(&'a str), + + /// Use the path from the request. + /// + /// That's very useful information, but is not appropriate if the + /// path contains parameters that differ on ever request, or worse, + /// sensitive information like usernames or email addresses. + /// + /// See + UriPath, +} + +/// Handle an incoming HTTP request using the given handler function, +/// with OpenTelemetry tracing. +/// +/// This runs 'handler' on the request in a new span, with fields filled in +/// from the request. Notably, if the request contains tracing information, +/// it is propagated to the span, so that this request is traced as part of +/// the same trace. +/// +/// XXX: Usually, this is handled by existing libraries, or built +/// directly into HTTP servers. However, I couldn't find one for Hyper, +/// so I had to write our own. OpenTelemetry website has a registry of +/// instrumentation libraries at: +/// https://opentelemetry.io/registry/?language=rust&component=instrumentation +/// If a Hyper crate appears, consider switching to that. 
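The doc comment above describes the wrapper defined just below. As a usage sketch (the handler and header value are made up for illustration), an incoming request that carries a W3C `traceparent` header ends up traced as part of the caller's trace:

```rust
use hyper::{Body, Request, Response};
use tracing_utils::http::{tracing_handler, OtelName};

async fn hello(_req: Request<Body>) -> Response<Body> {
    Response::new(Body::from("hello"))
}

async fn demo() -> Response<Body> {
    let req = Request::builder()
        .uri("/status")
        // W3C TraceContext format: version-traceid-spanid-flags
        .header(
            "traceparent",
            "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01",
        )
        .body(Body::empty())
        .unwrap();

    // The handler runs inside an "http request" span whose parent is the
    // remote context extracted from the header above.
    tracing_handler(req, hello, OtelName::UriPath).await
}
```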
+pub async fn tracing_handler( + req: Request, + handler: F, + otel_name: OtelName<'_>, +) -> Response +where + F: Fn(Request) -> R, + R: Future>, +{ + // Create a tracing span, with context propagated from the incoming + // request if any. + // + // See list of standard fields defined for HTTP requests at + // https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/semantic_conventions/http.md + // We only fill in a few of the most useful ones here. + let otel_name = match otel_name { + OtelName::Constant(s) => s, + OtelName::UriPath => req.uri().path(), + }; + + let span = tracing::info_span!( + "http request", + otel.name= %otel_name, + http.method = %req.method(), + http.status_code = tracing::field::Empty, + ); + let parent_ctx = extract_remote_context(req.headers()); + span.set_parent(parent_ctx); + + // Handle the request within the span + let response = handler(req).instrument(span.clone()).await; + + // Fill in the fields from the response code + let status = response.status(); + span.record("http.status_code", status.as_str()); + span.record( + "otel.status_code", + if status.is_success() { "OK" } else { "ERROR" }, + ); + + response +} + +// Extract remote tracing context from the HTTP headers +fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context { + struct HeaderExtractor<'a>(&'a HeaderMap); + + impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> { + fn get(&self, key: &str) -> Option<&str> { + self.0.get(key).and_then(|value| value.to_str().ok()) + } + + fn keys(&self) -> Vec<&str> { + self.0.keys().map(|value| value.as_str()).collect() + } + } + let extractor = HeaderExtractor(headers); + opentelemetry::global::get_text_map_propagator(|propagator| propagator.extract(&extractor)) +} diff --git a/libs/tracing-utils/src/lib.rs b/libs/tracing-utils/src/lib.rs new file mode 100644 index 0000000000..de0e2ad799 --- /dev/null +++ b/libs/tracing-utils/src/lib.rs @@ -0,0 +1,168 @@ +//! Helper functions to set up OpenTelemetry tracing. +//! +//! This comes in two variants, depending on whether you have a Tokio runtime available. +//! If you do, call `init_tracing()`. It sets up the trace processor and exporter to use +//! the current tokio runtime. If you don't have a runtime available, or you don't want +//! to share the runtime with the tracing tasks, call `init_tracing_without_runtime()` +//! instead. It sets up a dedicated single-threaded Tokio runtime for the tracing tasks. +//! +//! Example: +//! +//! ```rust,no_run +//! use tracing_subscriber::prelude::*; +//! use tracing_opentelemetry::OpenTelemetryLayer; +//! +//! #[tokio::main] +//! async fn main() { +//! // Set up logging to stderr +//! let env_filter = tracing_subscriber::EnvFilter::try_from_default_env() +//! .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")); +//! let fmt_layer = tracing_subscriber::fmt::layer() +//! .with_target(false) +//! .with_writer(std::io::stderr); +//! +//! // Initialize OpenTelemetry. Exports tracing spans as OpenTelemetry traces +//! let otlp_layer = tracing_utils::init_tracing("my_application").await.map(OpenTelemetryLayer::new); +//! +//! // Put it all together +//! tracing_subscriber::registry() +//! .with(env_filter) +//! .with(otlp_layer) +//! .with(fmt_layer) +//! .init(); +//! } +//! 
``` + +use opentelemetry::sdk::Resource; +use opentelemetry::KeyValue; +use opentelemetry_otlp::WithExportConfig; +use opentelemetry_otlp::{OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}; + +pub use tracing_opentelemetry::OpenTelemetryLayer; + +pub mod http; + +/// Set up OpenTelemetry exporter, using configuration from environment variables. +/// +/// `service_name` is set as the OpenTelemetry 'service.name' resource (see +/// ) +/// +/// We try to follow the conventions for the environment variables specified in +/// +/// +/// However, we only support a subset of those options: +/// +/// - OTEL_SDK_DISABLED is supported. The default is "false", meaning tracing +/// is enabled by default. Set it to "true" to disable. +/// +/// - We use the OTLP exporter, with HTTP protocol. Most of the OTEL_EXPORTER_OTLP_* +/// settings specified in +/// +/// are supported, as they are handled by the `opentelemetry-otlp` crate. +/// Settings related to other exporters have no effect. +/// +/// - Some other settings are supported by the `opentelemetry` crate. +/// +/// If you need some other setting, please test if it works first. And perhaps +/// add a comment in the list above to save the effort of testing for the next +/// person. +/// +/// This doesn't block, but is marked as 'async' to hint that this must be called in +/// asynchronous execution context. +pub async fn init_tracing(service_name: &str) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + Some(init_tracing_internal(service_name.to_string())) +} + +/// Like `init_tracing`, but creates a separate tokio Runtime for the tracing +/// tasks. +pub fn init_tracing_without_runtime( + service_name: &str, +) -> Option { + if std::env::var("OTEL_SDK_DISABLED") == Ok("true".to_string()) { + return None; + }; + + // The opentelemetry batch processor and the OTLP exporter needs a Tokio + // runtime. Create a dedicated runtime for them. One thread should be + // enough. + // + // (Alternatively, instead of batching, we could use the "simple + // processor", which doesn't need Tokio, and use "reqwest-blocking" + // feature for the OTLP exporter, which also doesn't need Tokio. However, + // batching is considered best practice, and also I have the feeling that + // the non-Tokio codepaths in the opentelemetry crate are less used and + // might be more buggy, so better to stay on the well-beaten path.) + // + // We leak the runtime so that it keeps running after we exit the + // function. + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_multi_thread() + .enable_all() + .thread_name("otlp runtime thread") + .worker_threads(1) + .build() + .unwrap(), + )); + let _guard = runtime.enter(); + + Some(init_tracing_internal(service_name.to_string())) +} + +fn init_tracing_internal(service_name: String) -> opentelemetry::sdk::trace::Tracer { + // Set up exporter from the OTEL_EXPORTER_* environment variables + let mut exporter = opentelemetry_otlp::new_exporter().http().with_env(); + + // XXX opentelemetry-otlp v0.18.0 has a bug in how it uses the + // OTEL_EXPORTER_OTLP_ENDPOINT env variable. According to the + // OpenTelemetry spec at + // , + // the full exporter URL is formed by appending "/v1/traces" to the value + // of OTEL_EXPORTER_OTLP_ENDPOINT. However, opentelemetry-otlp only does + // that with the grpc-tonic exporter. Other exporters, like the HTTP + // exporter, use the URL from OTEL_EXPORTER_OTLP_ENDPOINT as is, without + // appending "/v1/traces". 
+ // + // See https://github.com/open-telemetry/opentelemetry-rust/pull/950 + // + // Work around that by checking OTEL_EXPORTER_OTLP_ENDPOINT, and setting + // the endpoint url with the "/v1/traces" path ourselves. If the bug is + // fixed in a later version, we can remove this code. But if we don't + // remember to remove this, it won't do any harm either, as the crate will + // just ignore the OTEL_EXPORTER_OTLP_ENDPOINT setting when the endpoint + // is set directly with `with_endpoint`. + if std::env::var(OTEL_EXPORTER_OTLP_TRACES_ENDPOINT).is_err() { + if let Ok(mut endpoint) = std::env::var(OTEL_EXPORTER_OTLP_ENDPOINT) { + if !endpoint.ends_with('/') { + endpoint.push('/'); + } + endpoint.push_str("v1/traces"); + exporter = exporter.with_endpoint(endpoint); + } + } + + // Propagate trace information in the standard W3C TraceContext format. + opentelemetry::global::set_text_map_propagator( + opentelemetry::sdk::propagation::TraceContextPropagator::new(), + ); + + opentelemetry_otlp::new_pipeline() + .tracing() + .with_exporter(exporter) + .with_trace_config( + opentelemetry::sdk::trace::config().with_resource(Resource::new(vec![KeyValue::new( + opentelemetry_semantic_conventions::resource::SERVICE_NAME, + service_name, + )])), + ) + .install_batch(opentelemetry::runtime::Tokio) + .expect("could not initialize opentelemetry exporter") +} + +// Shutdown trace pipeline gracefully, so that it has a chance to send any +// pending traces before we exit. +pub fn shutdown_tracing() { + opentelemetry::global::shutdown_tracer_provider(); +} diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 020e4d9dd7..1f6c96bdbe 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +atty.workspace = true sentry.workspace = true async-trait.workspace = true anyhow.workspace = true diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index b0ecb746d9..1ba0422993 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -1,6 +1,7 @@ use hyper::{header, Body, Response, StatusCode}; use serde::{Deserialize, Serialize}; use thiserror::Error; +use tracing::error; #[derive(Debug, Error)] pub enum ApiError { @@ -76,8 +77,16 @@ impl HttpErrorBody { } pub async fn handler(err: routerify::RouteError) -> Response { - tracing::error!("Error processing HTTP request: {:?}", err); - err.downcast::() - .expect("handler should always return api error") - .into_response() + let api_error = err + .downcast::() + .expect("handler should always return api error"); + + // Print a stack trace for Internal Server errors + if let ApiError::InternalServerError(_) = api_error.as_ref() { + error!("Error processing HTTP request: {api_error:?}"); + } else { + error!("Error processing HTTP request: {api_error:#}"); + } + + api_error.into_response() } diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs index 82c9267f4a..02684d3d16 100644 --- a/libs/utils/src/logging.rs +++ b/libs/utils/src/logging.rs @@ -34,7 +34,7 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> { let base_logger = tracing_subscriber::fmt() .with_env_filter(env_filter) .with_target(false) - .with_ansi(false) + .with_ansi(atty::is(atty::Stream::Stdout)) .with_writer(std::io::stdout); match log_format { diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index cb9e4478bf..66c25e8576 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -11,7 +11,6 @@ default = [] testing = 
["fail/failpoints"] [dependencies] -amplify_num.workspace = true anyhow.workspace = true async-stream.workspace = true async-trait.workspace = true @@ -41,7 +40,6 @@ postgres-protocol.workspace = true postgres-types.workspace = true rand.workspace = true regex.workspace = true -rstar.workspace = true scopeguard.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["raw_value"] } @@ -68,6 +66,7 @@ tenant_size_model.workspace = true utils.workspace = true workspace_hack.workspace = true reqwest.workspace = true +rpds.workspace = true [dev-dependencies] criterion.workspace = true diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 6a01fdfc6f..e18c00da96 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,13 +1,12 @@ -use anyhow::Result; +use pageserver::keyspace::{KeyPartitioning, KeySpace}; use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; -use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, ValueReconstructState}; -use pageserver::tenant::storage_layer::{Layer, ValueReconstructResult}; +use pageserver::tenant::storage_layer::Layer; +use pageserver::tenant::storage_layer::{DeltaFileName, ImageFileName, LayerDescriptor}; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; use std::fs::File; use std::io::{BufRead, BufReader}; -use std::ops::Range; use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; @@ -17,102 +16,35 @@ use utils::lsn::Lsn; use criterion::{criterion_group, criterion_main, Criterion}; -struct DummyDelta { - key_range: Range, - lsn_range: Range, -} - -impl Layer for DummyDelta { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - self.lsn_range.clone() - } - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - ) -> Result { - panic!() - } - - fn is_incremental(&self) -> bool { - true - } - - fn dump(&self, _verbose: bool) -> Result<()> { - unimplemented!() - } - - fn short_id(&self) -> String { - unimplemented!() - } -} - -struct DummyImage { - key_range: Range, - lsn: Lsn, -} - -impl Layer for DummyImage { - fn get_key_range(&self) -> Range { - self.key_range.clone() - } - - fn get_lsn_range(&self) -> Range { - // End-bound is exclusive - self.lsn..(self.lsn + 1) - } - - fn get_value_reconstruct_data( - &self, - _key: Key, - _lsn_range: Range, - _reconstruct_data: &mut ValueReconstructState, - ) -> Result { - panic!() - } - - fn is_incremental(&self) -> bool { - false - } - - fn dump(&self, _verbose: bool) -> Result<()> { - unimplemented!() - } - - fn short_id(&self) -> String { - unimplemented!() - } -} - -fn build_layer_map(filename_dump: PathBuf) -> LayerMap { - let mut layer_map = LayerMap::::default(); +fn build_layer_map(filename_dump: PathBuf) -> LayerMap { + let mut layer_map = LayerMap::::default(); let mut min_lsn = Lsn(u64::MAX); let mut max_lsn = Lsn(0); let filenames = BufReader::new(File::open(filename_dump).unwrap()).lines(); + let mut updates = layer_map.batch_update(); for fname in filenames { let fname = &fname.unwrap(); if let Some(imgfilename) = ImageFileName::parse_str(fname) { - let layer = DummyImage { - key_range: imgfilename.key_range, - lsn: imgfilename.lsn, + let layer = LayerDescriptor { + key: imgfilename.key_range, + lsn: imgfilename.lsn..(imgfilename.lsn + 1), + is_incremental: false, + short_id: 
fname.to_string(), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); min_lsn = min(min_lsn, imgfilename.lsn); max_lsn = max(max_lsn, imgfilename.lsn); } else if let Some(deltafilename) = DeltaFileName::parse_str(fname) { - let layer = DummyDelta { - key_range: deltafilename.key_range, - lsn_range: deltafilename.lsn_range.clone(), + let layer = LayerDescriptor { + key: deltafilename.key_range.clone(), + lsn: deltafilename.lsn_range.clone(), + is_incremental: true, + short_id: fname.to_string(), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); min_lsn = min(min_lsn, deltafilename.lsn_range.start); max_lsn = max(max_lsn, deltafilename.lsn_range.end); } else { @@ -122,11 +54,12 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { println!("min: {min_lsn}, max: {max_lsn}"); + updates.flush(); layer_map } /// Construct a layer map query pattern for benchmarks -fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { +fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { // For each image layer we query one of the pages contained, at LSN right // before the image layer was created. This gives us a somewhat uniform // coverage of both the lsn and key space because image layers have @@ -150,6 +83,41 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> { .collect() } +// Construct a partitioning for testing get_difficulty map when we +// don't have an exact result of `collect_keyspace` to work with. +fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning { + let mut parts = Vec::new(); + + // We add a partition boundary at the start of each image layer, + // no matter what lsn range it covers. This is just the easiest + // thing to do. A better thing to do would be to get a real + // partitioning from some database. Even better, remove the need + // for key partitions by deciding where to create image layers + // directly based on a coverage-based difficulty map. + let mut keys: Vec<_> = layer_map + .iter_historic_layers() + .filter_map(|l| { + if l.is_incremental() { + None + } else { + let kr = l.get_key_range(); + Some(kr.start.next()) + } + }) + .collect(); + keys.sort(); + + let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap(); + for key in keys { + parts.push(KeySpace { + ranges: vec![current_key..key], + }); + current_key = key; + } + + KeyPartitioning { parts } +} + // Benchmark using metadata extracted from our performance test environment, from // a project where we have run pgbench many timmes. The pgbench database was initialized // between each test run. @@ -183,24 +151,68 @@ fn bench_from_captest_env(c: &mut Criterion) { // Benchmark using metadata extracted from a real project that was taknig // too long processing layer map queries. 
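Before the real-project benchmark below, a minimal sketch of the batch-update pattern used in `build_layer_map` above (the `LayerDescriptor` field names follow the diff; treat this as an illustration rather than code from the patch):

```rust
use std::sync::Arc;

use pageserver::repository::Key;
use pageserver::tenant::layer_map::LayerMap;
use pageserver::tenant::storage_layer::LayerDescriptor;
use utils::lsn::Lsn;

fn tiny_layer_map_example() {
    // Open a batch, insert descriptors, flush once at the end.
    let mut layer_map = LayerMap::default();
    let mut updates = layer_map.batch_update();

    let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
    updates.insert_historic(Arc::new(LayerDescriptor {
        key: zero..zero.add(10),
        lsn: Lsn(0x10)..Lsn(0x11), // image layers cover a single LSN
        is_incremental: false,
        short_id: "example image layer".to_string(),
    }));
    updates.flush();

    // The map can now answer point queries, as in the benchmarks.
    let _ = layer_map.search(zero.add(5), Lsn(0x20));
}
```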
fn bench_from_real_project(c: &mut Criterion) { - // TODO consider compressing this file + // Init layer map + let now = Instant::now(); let layer_map = build_layer_map(PathBuf::from("benches/odd-brook-layernames.txt")); + println!("Finished layer map init in {:?}", now.elapsed()); + + // Choose uniformly distributed queries let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map); - // Test with uniform query pattern - c.bench_function("real_map_uniform_queries", |b| { + // Choose inputs for get_difficulty_map + let latest_lsn = layer_map + .iter_historic_layers() + .map(|l| l.get_lsn_range().end) + .max() + .unwrap(); + let partitioning = uniform_key_partitioning(&layer_map, latest_lsn); + + // Check correctness of get_difficulty_map + // TODO put this in a dedicated test outside of this mod + { + println!("running correctness check"); + + let now = Instant::now(); + let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning); + assert!(result_bruteforce.len() == partitioning.parts.len()); + println!("Finished bruteforce in {:?}", now.elapsed()); + + let now = Instant::now(); + let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None); + assert!(result_fast.len() == partitioning.parts.len()); + println!("Finished fast in {:?}", now.elapsed()); + + // Assert results are equal. Manually iterate for easier debugging. + let zip = std::iter::zip( + &partitioning.parts, + std::iter::zip(result_bruteforce, result_fast), + ); + for (_part, (bruteforce, fast)) in zip { + assert_eq!(bruteforce, fast); + } + + println!("No issues found"); + } + + // Define and name the benchmark function + let mut group = c.benchmark_group("real_map"); + group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1); } }); }); + group.bench_function("get_difficulty_map", |b| { + b.iter(|| { + layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3)); + }); + }); + group.finish(); } // Benchmark using synthetic data. Arrange image layers on stacked diagonal lines. fn bench_sequential(c: &mut Criterion) { - let mut layer_map: LayerMap = LayerMap::default(); - // Init layer map. Create 100_000 layers arranged in 1000 diagonal lines. // // TODO This code is pretty slow and runs even if we're only running other @@ -208,39 +220,39 @@ fn bench_sequential(c: &mut Criterion) { // Putting it inside the `bench_function` closure is not a solution // because then it runs multiple times during warmup. let now = Instant::now(); + let mut layer_map = LayerMap::default(); + let mut updates = layer_map.batch_update(); for i in 0..100_000 { - // TODO try inserting a super-wide layer in between every 10 to reflect - // what often happens with L1 layers that include non-rel changes. - // Maybe do that as a separate test. let i32 = (i as u32) % 100; let zero = Key::from_hex("000000000000000000000000000000000000").unwrap(); - let layer = DummyImage { - key_range: zero.add(10 * i32)..zero.add(10 * i32 + 1), - lsn: Lsn(10 * i), + let layer = LayerDescriptor { + key: zero.add(10 * i32)..zero.add(10 * i32 + 1), + lsn: Lsn(i)..Lsn(i + 1), + is_incremental: false, + short_id: format!("Layer {}", i), }; - layer_map.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); } - - // Manually measure runtime without criterion because criterion - // has a minimum sample size of 10 and I don't want to run it 10 times. 
- println!("Finished init in {:?}", now.elapsed()); + updates.flush(); + println!("Finished layer map init in {:?}", now.elapsed()); // Choose 100 uniformly random queries let rng = &mut StdRng::seed_from_u64(1); let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map) - .choose_multiple(rng, 1) + .choose_multiple(rng, 100) .copied() .collect(); // Define and name the benchmark function - c.bench_function("sequential_uniform_queries", |b| { - // Run the search queries + let mut group = c.benchmark_group("sequential"); + group.bench_function("uniform_queries", |b| { b.iter(|| { for q in queries.clone().into_iter() { layer_map.search(q.0, q.1); } }); }); + group.finish(); } criterion_group!(group_1, bench_from_captest_env); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index f1d92ac36b..06d4853274 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -27,6 +27,7 @@ use tracing::*; /// use tokio_tar::{Builder, EntryType, Header}; +use crate::context::RequestContext; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -52,6 +53,7 @@ pub async fn send_basebackup_tarball<'a, W>( req_lsn: Option, prev_lsn: Option, full_backup: bool, + ctx: &'a RequestContext, ) -> anyhow::Result<()> where W: AsyncWrite + Send + Sync + Unpin, @@ -110,6 +112,7 @@ where lsn: backup_lsn, prev_record_lsn: prev_lsn, full_backup, + ctx, }; basebackup .send_tarball() @@ -129,6 +132,7 @@ where lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, + ctx: &'a RequestContext, } impl<'a, W> Basebackup<'a, W> @@ -171,23 +175,37 @@ where SlruKind::MultiXactOffsets, SlruKind::MultiXactMembers, ] { - for segno in self.timeline.list_slru_segments(kind, self.lsn).await? { + for segno in self + .timeline + .list_slru_segments(kind, self.lsn, self.ctx) + .await? + { self.add_slru_segment(kind, segno).await?; } } // Create tablespace directories - for ((spcnode, dbnode), has_relmap_file) in self.timeline.list_dbdirs(self.lsn).await? { + for ((spcnode, dbnode), has_relmap_file) in + self.timeline.list_dbdirs(self.lsn, self.ctx).await? + { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; // Gather and send relational files in each database if full backup is requested. if self.full_backup { - for rel in self.timeline.list_rels(spcnode, dbnode, self.lsn).await? { + for rel in self + .timeline + .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .await? + { self.add_rel(rel).await?; } } } - for xid in self.timeline.list_twophase_files(self.lsn).await? { + for xid in self + .timeline + .list_twophase_files(self.lsn, self.ctx) + .await? 
+ { self.add_twophase_file(xid).await?; } @@ -203,7 +221,10 @@ where } async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { - let nblocks = self.timeline.get_rel_size(tag, self.lsn, false).await?; + let nblocks = self + .timeline + .get_rel_size(tag, self.lsn, false, self.ctx) + .await?; // If the relation is empty, create an empty file if nblocks == 0 { @@ -223,7 +244,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false) + .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } @@ -245,14 +266,14 @@ where async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self .timeline - .get_slru_segment_size(slru, segno, self.lsn) + .get_slru_segment_size(slru, segno, self.lsn, self.ctx) .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); for blknum in 0..nblocks { let img = self .timeline - .get_slru_page_at_lsn(slru, segno, blknum, self.lsn) + .get_slru_page_at_lsn(slru, segno, blknum, self.lsn, self.ctx) .await?; if slru == SlruKind::Clog { @@ -287,7 +308,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, self.lsn) + .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) .await?; ensure!(img.len() == 512); Some(img) @@ -323,7 +344,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn) + .list_rels(spcnode, dbnode, self.lsn, self.ctx) .await? .is_empty() { @@ -356,7 +377,10 @@ where // Extract twophase state files // async fn add_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { - let img = self.timeline.get_twophase_file(xid, self.lsn).await?; + let img = self + .timeline + .get_twophase_file(xid, self.lsn, self.ctx) + .await?; let mut buf = BytesMut::new(); buf.extend_from_slice(&img[..]); @@ -394,12 +418,12 @@ where let checkpoint_bytes = self .timeline - .get_checkpoint(self.lsn) + .get_checkpoint(self.lsn, self.ctx) .await .context("failed to get checkpoint bytes")?; let pg_control_bytes = self .timeline - .get_control_file(self.lsn) + .get_control_file(self.lsn, self.ctx) .await .context("failed get control bytes")?; diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 5de6e4def5..f2cd93bd3a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -13,6 +13,7 @@ use tracing::*; use metrics::set_build_info_metric; use pageserver::{ config::{defaults::*, PageServerConf}, + context::{DownloadBehavior, RequestContext}, http, page_cache, page_service, task_mgr, task_mgr::TaskKind, task_mgr::{ @@ -26,7 +27,7 @@ use utils::{ logging, postgres_backend::AuthType, project_git_version, - sentry_init::{init_sentry, release_name}, + sentry_init::init_sentry, signals::{self, Signal}, tcp_listener, }; @@ -85,7 +86,10 @@ fn main() -> anyhow::Result<()> { }; // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.id.to_string())]); + let _sentry_guard = init_sentry( + Some(GIT_VERSION.into()), + &[("node_id", &conf.id.to_string())], + ); let tenants_path = conf.tenants_path(); if !tenants_path.exists() { @@ -246,7 +250,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { let signals = signals::install_shutdown_handlers()?; // Launch broker client - WALRECEIVER_RUNTIME.block_on(pageserver::walreceiver::init_broker_client(conf))?; 
+ WALRECEIVER_RUNTIME.block_on(pageserver::broker_client::init_broker_client(conf))?; // Initialize authentication for incoming connections let auth = match &conf.auth_type { @@ -325,6 +329,13 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { ); if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); task_mgr::spawn( MGMT_REQUEST_RUNTIME.handle(), TaskKind::MetricsCollection, @@ -338,6 +349,7 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { conf.metric_collection_interval, conf.synthetic_size_calculation_interval, conf.id, + metrics_ctx, ) .instrument(info_span!("metrics_collection")) .await?; @@ -349,17 +361,34 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { // Spawn a task to listen for libpq connections. It will spawn further tasks // for each connection. We created the listener earlier already. - task_mgr::spawn( - COMPUTE_REQUEST_RUNTIME.handle(), - TaskKind::LibpqEndpointListener, - None, - None, - "libpq endpoint listener", - true, - async move { - page_service::libpq_listener_main(conf, auth, pageserver_listener, conf.auth_type).await - }, - ); + { + let libpq_ctx = RequestContext::todo_child( + TaskKind::LibpqEndpointListener, + // listener task shouldn't need to download anything. (We will + // create a separate sub-contexts for each connection, with their + // own download behavior. This context is used only to listen and + // accept connections.) + DownloadBehavior::Error, + ); + task_mgr::spawn( + COMPUTE_REQUEST_RUNTIME.handle(), + TaskKind::LibpqEndpointListener, + None, + None, + "libpq endpoint listener", + true, + async move { + page_service::libpq_listener_main( + conf, + auth, + pageserver_listener, + conf.auth_type, + libpq_ctx, + ) + .await + }, + ); + } // All started up! Now just sit and wait for shutdown signal. signals.handle(|signal| match signal { diff --git a/pageserver/src/broker_client.rs b/pageserver/src/broker_client.rs new file mode 100644 index 0000000000..6c92967ca3 --- /dev/null +++ b/pageserver/src/broker_client.rs @@ -0,0 +1,48 @@ +//! The broker client instance of the pageserver, created during pageserver startup. +//! Used by each timelines' [`walreceiver`]. + +use crate::config::PageServerConf; + +use anyhow::Context; +use once_cell::sync::OnceCell; +use storage_broker::BrokerClientChannel; +use tracing::*; + +static BROKER_CLIENT: OnceCell = OnceCell::new(); + +/// +/// Initialize the broker client. This must be called once at page server startup. +/// +pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { + let broker_endpoint = conf.broker_endpoint.clone(); + + // Note: we do not attempt connecting here (but validate endpoints sanity). 
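The accessors defined a little further down in this module are how the rest of the pageserver (notably each timeline's walreceiver) reaches the shared client. A minimal usage sketch, with the actual subscription call elided since it depends on the `storage_broker` API:

```rust
use pageserver::broker_client::{get_broker_client, is_broker_client_initialized};

fn walreceiver_broker_handle_example() {
    // init_broker_client() must have been awaited at startup before this runs.
    if is_broker_client_initialized() {
        let _client = get_broker_client();
        // ... subscribe to safekeeper timeline updates with `_client` here ...
    }
}
```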
+ let broker_client = + storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( + format!( + "Failed to create broker client to {}", + &conf.broker_endpoint + ), + )?; + + if BROKER_CLIENT.set(broker_client).is_err() { + panic!("broker already initialized"); + } + + info!( + "Initialized broker client with endpoints: {}", + broker_endpoint + ); + Ok(()) +} + +/// +/// Get a handle to the broker client +/// +pub fn get_broker_client() -> &'static BrokerClientChannel { + BROKER_CLIENT.get().expect("broker client not initialized") +} + +pub fn is_broker_client_initialized() -> bool { + BROKER_CLIENT.get().is_some() +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 51d1664e52..a3b051279d 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -158,6 +158,8 @@ pub struct PageServerConf { pub synthetic_size_calculation_interval: Duration, pub test_remote_failures: u64, + + pub ondemand_download_behavior_treat_error_as_warn: bool, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -222,6 +224,8 @@ struct PageServerConfigBuilder { synthetic_size_calculation_interval: BuilderValue, test_remote_failures: BuilderValue, + + ondemand_download_behavior_treat_error_as_warn: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -267,6 +271,8 @@ impl Default for PageServerConfigBuilder { metric_collection_endpoint: Set(DEFAULT_METRIC_COLLECTION_ENDPOINT), test_remote_failures: Set(0), + + ondemand_download_behavior_treat_error_as_warn: Set(false), } } } @@ -363,6 +369,14 @@ impl PageServerConfigBuilder { self.test_remote_failures = BuilderValue::Set(fail_first); } + pub fn ondemand_download_behavior_treat_error_as_warn( + &mut self, + ondemand_download_behavior_treat_error_as_warn: bool, + ) { + self.ondemand_download_behavior_treat_error_as_warn = + BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn); + } + pub fn build(self) -> anyhow::Result { Ok(PageServerConf { listen_pg_addr: self @@ -422,6 +436,11 @@ impl PageServerConfigBuilder { test_remote_failures: self .test_remote_failures .ok_or(anyhow!("missing test_remote_failuers"))?, + ondemand_download_behavior_treat_error_as_warn: self + .ondemand_download_behavior_treat_error_as_warn + .ok_or(anyhow!( + "missing ondemand_download_behavior_treat_error_as_warn" + ))?, }) } } @@ -600,6 +619,7 @@ impl PageServerConf { "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), "test_remote_failures" => builder.test_remote_failures(parse_toml_u64(key, item)?), + "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -724,6 +744,7 @@ impl PageServerConf { metric_collection_endpoint: defaults::DEFAULT_METRIC_COLLECTION_ENDPOINT, synthetic_size_calculation_interval: Duration::from_secs(60), test_remote_failures: 0, + ondemand_download_behavior_treat_error_as_warn: false, } } } @@ -749,6 +770,11 @@ fn parse_toml_u64(name: &str, item: &Item) -> Result { Ok(i as u64) } +fn parse_toml_bool(name: &str, item: &Item) -> Result { + item.as_bool() + .with_context(|| format!("configure option {name} is not a bool")) +} + fn parse_toml_duration(name: &str, item: &Item) -> Result { let s = item .as_str() @@ -907,6 +933,7 @@ log_format = 'json' defaults::DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL )?, test_remote_failures: 0, + 
ondemand_download_behavior_treat_error_as_warn: false, }, "Correct defaults should be used when no config values are provided" ); @@ -954,6 +981,7 @@ log_format = 'json' metric_collection_endpoint: Some(Url::parse("http://localhost:80/metrics")?), synthetic_size_calculation_interval: Duration::from_secs(333), test_remote_failures: 0, + ondemand_download_behavior_treat_error_as_warn: false, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index c07026261d..d848ec5ee5 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -3,6 +3,7 @@ //! and push them to a HTTP endpoint. //! Cache metrics to send only the updated ones. //! +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::mgr; use anyhow; @@ -47,12 +48,15 @@ pub async fn collect_metrics( metric_collection_interval: Duration, synthetic_size_calculation_interval: Duration, node_id: NodeId, + ctx: RequestContext, ) -> anyhow::Result<()> { let mut ticker = tokio::time::interval(metric_collection_interval); info!("starting collect_metrics"); // spin up background worker that caclulates tenant sizes + let worker_ctx = + ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download); task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::CalculateSyntheticSize, @@ -61,7 +65,7 @@ pub async fn collect_metrics( "synthetic size calculation", false, async move { - calculate_synthetic_size_worker(synthetic_size_calculation_interval) + calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx) .instrument(info_span!("synthetic_size_worker")) .await?; Ok(()) @@ -79,7 +83,7 @@ pub async fn collect_metrics( return Ok(()); }, _ = ticker.tick() => { - if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id).await + if let Err(err) = collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx).await { error!("metrics collection failed: {err:?}"); } @@ -102,6 +106,7 @@ pub async fn collect_metrics_iteration( cached_metrics: &mut HashMap, metric_collection_endpoint: &reqwest::Url, node_id: NodeId, + ctx: &RequestContext, ) -> anyhow::Result<()> { let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new(); trace!( @@ -110,7 +115,7 @@ pub async fn collect_metrics_iteration( ); // get list of tenants - let tenants = mgr::list_tenants().await; + let tenants = mgr::list_tenants().await?; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -137,7 +142,7 @@ pub async fn collect_metrics_iteration( timeline_written_size, )); - let (timeline_logical_size, is_exact) = timeline.get_current_logical_size()?; + let (timeline_logical_size, is_exact) = timeline.get_current_logical_size(ctx)?; // Only send timeline logical size when it is fully calculated. 
if is_exact { current_metrics.push(( @@ -258,6 +263,7 @@ pub async fn collect_metrics_iteration( /// Caclculate synthetic size for each active tenant pub async fn calculate_synthetic_size_worker( synthetic_size_calculation_interval: Duration, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("starting calculate_synthetic_size_worker"); @@ -270,7 +276,13 @@ pub async fn calculate_synthetic_size_worker( }, _ = ticker.tick() => { - let tenants = mgr::list_tenants().await; + let tenants = match mgr::list_tenants().await { + Ok(tenants) => tenants, + Err(e) => { + warn!("cannot get tenant list: {e:#}"); + continue; + } + }; // iterate through list of Active tenants and collect metrics for (tenant_id, tenant_state) in tenants { @@ -280,7 +292,7 @@ pub async fn calculate_synthetic_size_worker( if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { - if let Err(e) = tenant.calculate_synthetic_size().await { + if let Err(e) = tenant.calculate_synthetic_size(ctx).await { error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e); } } diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs new file mode 100644 index 0000000000..e826d28e6d --- /dev/null +++ b/pageserver/src/context.rs @@ -0,0 +1,199 @@ +//! This module defines `RequestContext`, a structure that we use throughout +//! the pageserver to propagate high-level context from places +//! that _originate_ activity down to the shared code paths at the +//! heart of the pageserver. It's inspired by Golang's `context.Context`. +//! +//! For example, in `Timeline::get(page_nr, lsn)` we need to answer the following questions: +//! 1. What high-level activity ([`TaskKind`]) needs this page? +//! We need that information as a categorical dimension for page access +//! statistics, which we, in turn, need to guide layer eviction policy design. +//! 2. How should we behave if, to produce the page image, we need to +//! on-demand download a layer file ([`DownloadBehavior`]). +//! +//! [`RequestContext`] satisfies those needs. +//! The current implementation is a small `struct` that is passed through +//! the call chain by reference. +//! +//! ### Future Work +//! +//! However, we do not intend to stop here, since there are other needs that +//! require carrying information from high to low levels of the app. +//! +//! Most importantly, **cancellation signaling** in response to +//! 1. timeouts (page_service max response time) and +//! 2. lifecycle requests (detach tenant, delete timeline). +//! +//! Related to that, there is sometimes a need to ensure that all tokio tasks spawned +//! by the transitive callees of a request have finished. The keyword here +//! is **Structured Concurrency**, and right now, we use `task_mgr` in most places, +//! `TaskHandle` in some places, and careful code review around `FuturesUnordered` +//! or `JoinSet` in other places. +//! +//! We do not yet have a systematic cancellation story in pageserver, and it is +//! pretty clear that [`RequestContext`] will be responsible for that. +//! So, the API already prepares for this role through the +//! [`RequestContext::detached_child`] and [`RequestContext::attached_child`] methods. +//! See their doc comments for details on how we will use them in the future. +//! +//! It is not clear whether or how we will enforce Structured Concurrency, and +//! what role [`RequestContext`] will play there. +//! So, the API doesn't prepare us for this topic. +//! +//! Other future uses of `RequestContext`: +//! 
- Communicate compute & IO priorities (user-initiated request vs. background-loop) +//! - Request IDs for distributed tracing +//! - Request/Timeline/Tenant-scoped log levels +//! +//! RequestContext might look quite different once it supports those features. +//! Likely, it will have a shape similar to Golang's `context.Context`. +//! +//! ### Why A Struct Instead Of Method Parameters +//! +//! What's typical about such information is that it needs to be passed down +//! along the call chain from high level to low level, but few of the functions +//! in the middle need to understand it. +//! Further, it is to be expected that we will need to propagate more data +//! in the future (see the earlier section on future work). +//! Hence, for functions in the middle of the call chain, we have the following +//! requirements: +//! 1. It should be easy to forward the context to callees. +//! 2. To propagate more data from high-level to low-level code, the functions in +//! the middle should not need to be modified. +//! The solution is to have a container structure ([`RequestContext`]) that +//! carries the information. Functions that don't care about what's in it +//! pass it along to callees. +//! +//! ### Why Not Task-Local Variables +//! +//! One could use task-local variables (the equivalent of thread-local variables) +//! to address the immediate needs outlined above. +//! However, we reject task-local variables because: +//! 1. they are implicit, thereby making it harder to trace the data flow in code +//! reviews and during debugging, +//! 2. they can be mutable, which enables implicit return data flow, +//! 3. they are restrictive in that code which fans out into multiple tasks, +//! or even threads, needs to carefully propagate the state. +//! +//! In contrast, information flow with [`RequestContext`] is +//! 1. always explicit, +//! 2. strictly uni-directional because RequestContext is immutable, +//! 3. tangible because a [`RequestContext`] is just a value. +//! When creating child activities, regardless of whether it's a task, +//! thread, or even an RPC to another service, the value can +//! be used like any other argument. +//! +//! The solution is that all code paths are infected with precisely one +//! [`RequestContext`] argument. Functions in the middle of the call chain +//! only need to pass it on. +use crate::task_mgr::TaskKind; + +// The main structure of this module, see module-level comment. +pub struct RequestContext { + task_kind: TaskKind, + download_behavior: DownloadBehavior, +} + +/// Desired behavior if the operation requires an on-demand download +/// to proceed. +#[derive(Clone, Copy, PartialEq, Eq)] +pub enum DownloadBehavior { + /// Download the layer file. It can take a while. + Download, + + /// Download the layer file, but print a warning to the log. This should be used + /// in code where the layer file is expected to already exist locally. + Warn, + + /// Return a PageReconstructError::NeedsDownload error + Error, +} + +impl RequestContext { + /// Create a new RequestContext that has no parent. + /// + /// The function is called `new` because, once we add children + /// to it using `detached_child` or `attached_child`, the context + /// form a tree (not implemented yet since cancellation will be + /// the first feature that requires a tree). + /// + /// # Future: Cancellation + /// + /// The only reason why a context like this one can be canceled is + /// because someone explicitly canceled it. 
+ /// It has no parent, so it cannot inherit cancellation from there. + pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + RequestContext { + task_kind, + download_behavior, + } + } + + /// Create a detached child context for a task that may outlive `self`. + /// + /// Use this when spawning new background activity that should complete + /// even if the current request is canceled. + /// + /// # Future: Cancellation + /// + /// Cancellation of `self` will not propagate to the child context returned + /// by this method. + /// + /// # Future: Structured Concurrency + /// + /// We could add the Future as a parameter to this function, spawn it as a task, + /// and pass to the new task the child context as an argument. + /// That would be an ergonomic improvement. + /// + /// We could make new calls to this function fail if `self` is already canceled. + pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + self.child_impl(task_kind, download_behavior) + } + + /// Create a child of context `self` for a task that shall not outlive `self`. + /// + /// Use this when fanning-out work to other async tasks. + /// + /// # Future: Cancellation + /// + /// Cancelling a context will propagate to its attached children. + /// + /// # Future: Structured Concurrency + /// + /// We could add the Future as a parameter to this function, spawn it as a task, + /// and track its `JoinHandle` inside the `RequestContext`. + /// + /// We could then provide another method to allow waiting for all child tasks + /// to finish. + /// + /// We could make new calls to this function fail if `self` is already canceled. + /// Alternatively, we could allow the creation but not spawn the task. + /// The method to wait for child tasks would return an error, indicating + /// that the child task was not started because the context was canceled. + pub fn attached_child(&self) -> Self { + self.child_impl(self.task_kind(), self.download_behavior()) + } + + /// Use this function when you should be creating a child context using + /// [`attached_child`] or [`detached_child`], but your caller doesn't provide + /// a context and you are unwilling to change all callers to provide one. + /// + /// Before we add cancellation, we should get rid of this method. + pub fn todo_child(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + Self::new(task_kind, download_behavior) + } + + fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self { + RequestContext { + task_kind, + download_behavior, + } + } + + pub fn task_kind(&self) -> TaskKind { + self.task_kind + } + + pub fn download_behavior(&self) -> DownloadBehavior { + self.download_behavior + } +} diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index f9b8a81dad..23faff7ace 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -430,6 +430,13 @@ paths: schema: type: string format: hex + - name: inputs_only + in: query + required: false + schema: + type: boolean + description: | + When true, skip calculation and only provide the model inputs (for debugging). Defaults to false. get: description: | Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes). @@ -449,8 +456,9 @@ paths: format: hex size: type: integer + nullable: true description: | - Size metric in bytes. + Size metric in bytes or null if inputs_only=true was given. 
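Stepping back to the `RequestContext` API above, a minimal sketch of the intended usage pattern; the task kinds and download behaviors mirror call sites elsewhere in this patch:

```rust
use pageserver::context::{DownloadBehavior, RequestContext};
use pageserver::task_mgr::TaskKind;

fn request_context_example() {
    // A management request that is allowed to trigger on-demand downloads.
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    // Fan out work that must not outlive this request.
    let child_ctx = ctx.attached_child();

    // Background work that may outlive the request, e.g. synthetic size calculation.
    let bg_ctx =
        ctx.detached_child(TaskKind::CalculateSyntheticSize, DownloadBehavior::Download);

    let _ = (child_ctx, bg_ctx);
}
```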
"401": description: Unauthorized Error content: diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1eb24c1507..a7802f3cbe 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -12,8 +12,11 @@ use super::models::{ StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, TimelineCreateRequest, TimelineInfo, }; +use crate::context::{DownloadBehavior, RequestContext}; use crate::pgdatadir_mapping::LsnForTimestamp; +use crate::task_mgr::TaskKind; use crate::tenant::config::TenantConfOpt; +use crate::tenant::mgr::TenantMapInsertError; use crate::tenant::{PageReconstructError, Timeline}; use crate::{config::PageServerConf, tenant::mgr}; use utils::{ @@ -81,18 +84,39 @@ fn check_permission(request: &Request, tenant_id: Option) -> Res fn apierror_from_prerror(err: PageReconstructError) -> ApiError { match err { PageReconstructError::Other(err) => ApiError::InternalServerError(err), + PageReconstructError::NeedsDownload(_, _) => { + // This shouldn't happen, because we use a RequestContext that requests to + // download any missing layer files on-demand. + ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file")) + } + PageReconstructError::Cancelled => { + ApiError::InternalServerError(anyhow::anyhow!("request was cancelled")) + } PageReconstructError::WalRedo(err) => { ApiError::InternalServerError(anyhow::Error::new(err)) } } } +fn apierror_from_tenant_map_insert_error(e: TenantMapInsertError) -> ApiError { + match e { + TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => { + ApiError::InternalServerError(anyhow::Error::new(e)) + } + TenantMapInsertError::TenantAlreadyExists(id, state) => { + ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}")) + } + TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e), + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, include_non_incremental_logical_size: bool, + ctx: &RequestContext, ) -> anyhow::Result { - let mut info = build_timeline_info_common(timeline)?; + let mut info = build_timeline_info_common(timeline, ctx)?; if include_non_incremental_logical_size { // XXX we should be using spawn_ondemand_logical_size_calculation here. // Otherwise, if someone deletes the timeline / detaches the tenant while @@ -102,6 +126,7 @@ async fn build_timeline_info( .get_current_logical_size_non_incremental( info.last_record_lsn, CancellationToken::new(), + ctx, ) .await?, ); @@ -109,7 +134,10 @@ async fn build_timeline_info( Ok(info) } -fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result { +fn build_timeline_info_common( + timeline: &Arc, + ctx: &RequestContext, +) -> anyhow::Result { let last_record_lsn = timeline.get_last_record_lsn(); let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = { let guard = timeline.last_received_wal.lock().unwrap(); @@ -129,7 +157,7 @@ fn build_timeline_info_common(timeline: &Arc) -> anyhow::Result None, lsn @ Lsn(_) => Some(lsn), }; - let current_logical_size = match timeline.get_current_logical_size() { + let current_logical_size = match timeline.get_current_logical_size(ctx) { Ok((size, _)) => Some(size), Err(err) => { error!("Timeline info creation failed to get current logical size: {err:?}"); @@ -180,6 +208,8 @@ async fn timeline_create_handler(mut request: Request) -> Result) -> Result { // Created. Construct a TimelineInfo for it. 
- let timeline_info = build_timeline_info_common(&new_timeline) + let timeline_info = build_timeline_info_common(&new_timeline, &ctx) .map_err(ApiError::InternalServerError)?; json_response(StatusCode::CREATED, timeline_info) } @@ -208,6 +239,8 @@ async fn timeline_list_handler(request: Request) -> Result, query_param_present(&request, "include-non-incremental-logical-size"); check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + let response_data = async { let tenant = mgr::get_tenant(tenant_id, true) .await @@ -217,7 +250,7 @@ async fn timeline_list_handler(request: Request) -> Result, let mut response_data = Vec::with_capacity(timelines.len()); for timeline in timelines { let timeline_info = - build_timeline_info(&timeline, include_non_incremental_logical_size) + build_timeline_info(&timeline, include_non_incremental_logical_size, &ctx) .await .context( "Failed to convert tenant timeline {timeline_id} into the local one: {e:?}", @@ -239,11 +272,7 @@ fn query_param_present(request: &Request, param: &str) -> bool { request .uri() .query() - .map(|v| { - url::form_urlencoded::parse(v.as_bytes()) - .into_owned() - .any(|(p, _)| p == param) - }) + .map(|v| url::form_urlencoded::parse(v.as_bytes()).any(|(p, _)| p == param)) .unwrap_or(false) } @@ -252,13 +281,12 @@ fn get_query_param(request: &Request, param_name: &str) -> Result) -> Result) -> Result(timeline_info) } @@ -304,12 +336,13 @@ async fn get_lsn_by_timestamp_handler(request: Request) -> Result) -> Result, let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + info!("Handling tenant attach {tenant_id}"); let state = get_state(&request); if let Some(remote_storage) = &state.remote_storage { - // FIXME: distinguish between "Tenant already exists" and other errors - mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone()) + mgr::attach_tenant(state.conf, tenant_id, remote_storage.clone(), &ctx) .instrument(info_span!("tenant_attach", tenant = %tenant_id)) .await - .map_err(ApiError::InternalServerError)?; + .map_err(apierror_from_tenant_map_insert_error)?; } else { return Err(ApiError::BadRequest(anyhow!( "attach_tenant is not possible because pageserver was configured without remote storage" @@ -351,7 +385,9 @@ async fn timeline_delete_handler(request: Request) -> Result) -> Result, A let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let state = get_state(&request); - mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone()) + mgr::load_tenant(state.conf, tenant_id, state.remote_storage.clone(), &ctx) .instrument(info_span!("load", tenant = %tenant_id)) .await - .map_err(ApiError::InternalServerError)?; + .map_err(apierror_from_tenant_map_insert_error)?; json_response(StatusCode::ACCEPTED, ()) } @@ -413,6 +451,8 @@ async fn tenant_list_handler(request: Request) -> Result, A let response_data = mgr::list_tenants() .instrument(info_span!("tenant_list")) .await + .map_err(anyhow::Error::new) + .map_err(ApiError::InternalServerError)? 
.iter() .map(|(id, state)| TenantInfo { id: *id, @@ -453,21 +493,40 @@ async fn tenant_status(request: Request) -> Result, ApiErro json_response(StatusCode::OK, tenant_info) } +/// HTTP endpoint to query the current tenant_size of a tenant. +/// +/// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used +/// to debug any of the calculations. Requires `tenant_id` request parameter, supports +/// `inputs_only=true|false` (default false) which supports debugging failure to calculate model +/// values. async fn tenant_size_handler(request: Request) -> Result, ApiError> { let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; check_permission(&request, Some(tenant_id))?; + let inputs_only = if query_param_present(&request, "inputs_only") { + get_query_param(&request, "inputs_only")? + .parse() + .map_err(|_| ApiError::BadRequest(anyhow!("failed to parse inputs_only")))? + } else { + false + }; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let tenant = mgr::get_tenant(tenant_id, true) .await .map_err(ApiError::InternalServerError)?; - // this can be long operation, it currently is not backed by any request coalescing or similar + // this can be long operation let inputs = tenant - .gather_size_inputs() + .gather_size_inputs(&ctx) .await .map_err(ApiError::InternalServerError)?; - let size = inputs.calculate().map_err(ApiError::InternalServerError)?; + let size = if !inputs_only { + Some(inputs.calculate().map_err(ApiError::InternalServerError)?) + } else { + None + }; /// Private response type with the additional "unstable" `inputs` field. /// @@ -479,7 +538,9 @@ async fn tenant_size_handler(request: Request) -> Result, A #[serde_as(as = "serde_with::DisplayFromStr")] id: TenantId, /// Size is a mixture of WAL and logical size, so the unit is bytes. - size: u64, + /// + /// Will be none if `?inputs_only=true` was given. + size: Option, inputs: crate::tenant::size::ModelInputs, } @@ -506,6 +567,8 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() async fn tenant_create_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let request_data: TenantCreateRequest = json_request(&mut request).await?; let mut tenant_conf = TenantConfOpt::default(); @@ -583,34 +646,28 @@ async fn tenant_create_handler(mut request: Request) -> Result { - // We created the tenant. Existing API semantics are that the tenant - // is Active when this function returns. - if let res @ Err(_) = tenant.wait_to_become_active().await { - // This shouldn't happen because we just created the tenant directory - // in tenant::mgr::create_tenant, and there aren't any remote timelines - // to load, so, nothing can really fail during load. - // Don't do cleanup because we don't know how we got here. - // The tenant will likely be in `Broken` state and subsequent - // calls will fail. - res.context("created tenant failed to become active") - .map_err(ApiError::InternalServerError)?; - } - json_response( - StatusCode::CREATED, - TenantCreateResponse(tenant.tenant_id()), - )? - } - None => json_response(StatusCode::CONFLICT, ())?, - }) + // We created the tenant. Existing API semantics are that the tenant + // is Active when this function returns. 
+ if let res @ Err(_) = new_tenant.wait_to_become_active().await { + // This shouldn't happen because we just created the tenant directory + // in tenant::mgr::create_tenant, and there aren't any remote timelines + // to load, so, nothing can really fail during load. + // Don't do cleanup because we don't know how we got here. + // The tenant will likely be in `Broken` state and subsequent + // calls will fail. + res.context("created tenant failed to become active") + .map_err(ApiError::InternalServerError)?; + } + json_response( + StatusCode::CREATED, + TenantCreateResponse(new_tenant.tenant_id()), + ) } async fn tenant_config_handler(mut request: Request) -> Result, ApiError> { @@ -732,7 +789,8 @@ async fn timeline_gc_handler(mut request: Request) -> Result) -> Result) -> Result) -> Result Result<()> { let mut pg_control: Option = None; @@ -69,7 +71,7 @@ pub async fn import_timeline_from_postgres_datadir( let mut file = tokio::fs::File::open(absolute_path).await?; let len = metadata.len() as usize; if let Some(control_file) = - import_file(&mut modification, relative_path, &mut file, len).await? + import_file(&mut modification, relative_path, &mut file, len, ctx).await? { pg_control = Some(control_file); } @@ -99,6 +101,7 @@ pub async fn import_timeline_from_postgres_datadir( tline, Lsn(pg_control.checkPointCopy.redo), pgdata_lsn, + ctx, ) .await?; @@ -113,6 +116,7 @@ async fn import_rel( dboid: Oid, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Does it look like a relation file? trace!("importing rel file {}", path.display()); @@ -147,7 +151,10 @@ async fn import_rel( // FIXME: use proper error type for this, instead of parsing the error message. // Or better yet, keep track of which relations we've already created // https://github.com/neondatabase/neon/issues/3309 - if let Err(e) = modification.put_rel_creation(rel, nblocks as u32).await { + if let Err(e) = modification + .put_rel_creation(rel, nblocks as u32, ctx) + .await + { if e.to_string().contains("already exists") { debug!("relation {} already exists. we must be extending it", rel); } else { @@ -182,7 +189,7 @@ async fn import_rel( // // If we process rel segments out of order, // put_rel_extend will skip the update. 
- modification.put_rel_extend(rel, blknum).await?; + modification.put_rel_extend(rel, blknum, ctx).await?; Ok(()) } @@ -195,6 +202,7 @@ async fn import_slru( path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("importing slru file {path:?}"); @@ -211,7 +219,7 @@ async fn import_slru( ensure!(nblocks <= pg_constants::SLRU_PAGES_PER_SEGMENT as usize); modification - .put_slru_segment_creation(slru, segno, nblocks as u32) + .put_slru_segment_creation(slru, segno, nblocks as u32, ctx) .await?; let mut rpageno = 0; @@ -252,15 +260,15 @@ async fn import_wal( tline: &Timeline, startpoint: Lsn, endpoint: Lsn, + ctx: &RequestContext, ) -> anyhow::Result<()> { - use std::io::Read; let mut waldecoder = WalStreamDecoder::new(startpoint, tline.pg_version); let mut segno = startpoint.segment_number(WAL_SEGMENT_SIZE); let mut offset = startpoint.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = startpoint; - let mut walingest = WalIngest::new(tline, startpoint).await?; + let mut walingest = WalIngest::new(tline, startpoint, ctx).await?; while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now @@ -283,6 +291,7 @@ async fn import_wal( file.seek(std::io::SeekFrom::Start(offset as u64))?; } + use std::io::Read; let nread = file.read_to_end(&mut buf)?; if nread != WAL_SEGMENT_SIZE - offset { // Maybe allow this for .partial files? @@ -297,7 +306,7 @@ async fn import_wal( while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; last_lsn = lsn; @@ -326,6 +335,7 @@ pub async fn import_basebackup_from_tar( tline: &Timeline, reader: &mut (impl AsyncRead + Send + Sync + Unpin), base_lsn: Lsn, + ctx: &RequestContext, ) -> Result<()> { info!("importing base at {base_lsn}"); let mut modification = tline.begin_modification(base_lsn); @@ -344,7 +354,7 @@ pub async fn import_basebackup_from_tar( match header.entry_type() { tokio_tar::EntryType::Regular => { if let Some(res) = - import_file(&mut modification, file_path.as_ref(), &mut entry, len).await? + import_file(&mut modification, file_path.as_ref(), &mut entry, len, ctx).await? { // We found the pg_control file. pg_control = Some(res); @@ -376,13 +386,14 @@ pub async fn import_wal_from_tar( reader: &mut (impl AsyncRead + Send + Sync + Unpin), start_lsn: Lsn, end_lsn: Lsn, + ctx: &RequestContext, ) -> Result<()> { // Set up walingest mutable state let mut waldecoder = WalStreamDecoder::new(start_lsn, tline.pg_version); let mut segno = start_lsn.segment_number(WAL_SEGMENT_SIZE); let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; - let mut walingest = WalIngest::new(tline, start_lsn).await?; + let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); @@ -431,7 +442,7 @@ pub async fn import_wal_from_tar( while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ walingest - .ingest_record(recdata, lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; last_lsn = lsn; @@ -466,6 +477,7 @@ async fn import_file( file_path: &Path, reader: &mut (impl AsyncRead + Send + Sync + Unpin), len: usize, + ctx: &RequestContext, ) -> Result> { let file_name = match file_path.file_name() { Some(name) => name.to_string_lossy(), @@ -498,14 +510,16 @@ async fn import_file( } "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; - modification.put_relmap_file(spcnode, dbnode, bytes).await?; + modification + .put_relmap_file(spcnode, dbnode, bytes, ctx) + .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; + import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } @@ -521,38 +535,40 @@ async fn import_file( match file_name.as_ref() { "pg_filenode.map" => { let bytes = read_all_bytes(reader).await?; - modification.put_relmap_file(spcnode, dbnode, bytes).await?; + modification + .put_relmap_file(spcnode, dbnode, bytes, ctx) + .await?; debug!("imported relmap file") } "PG_VERSION" => { debug!("ignored PG_VERSION file"); } _ => { - import_rel(modification, file_path, spcnode, dbnode, reader, len).await?; + import_rel(modification, file_path, spcnode, dbnode, reader, len, ctx).await?; debug!("imported rel creation"); } } } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported clog slru"); } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact offsets slru"); } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len).await?; + import_slru(modification, slru, file_path, reader, len, ctx).await?; debug!("imported multixact members slru"); } else if file_path.starts_with("pg_twophase") { let xid = u32::from_str_radix(file_name.as_ref(), 16)?; let bytes = read_all_bytes(reader).await?; modification - .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..])) + .put_twophase_file(xid, Bytes::copy_from_slice(&bytes[..]), ctx) .await?; debug!("imported twophase file"); } else if file_path.starts_with("pg_wal") { diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 91cde477ad..09e21ae755 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -1,7 +1,9 @@ mod auth; pub mod basebackup; +pub mod broker_client; pub mod config; pub mod consumption_metrics; +pub mod context; pub mod http; pub mod import_datadir; pub mod keyspace; @@ -15,7 +17,6 @@ pub mod tenant; pub mod trace; pub mod virtual_file; pub mod walingest; -pub mod walreceiver; pub mod walrecord; pub mod walredo; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b61e64048b..6bd0eddbb5 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1,10 +1,12 @@ use metrics::core::{AtomicU64, GenericCounter}; use metrics::{ - register_histogram, register_histogram_vec, register_int_counter, register_int_counter_vec, - register_int_gauge, 
register_int_gauge_vec, register_uint_gauge_vec, Histogram, HistogramVec, - IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, + register_counter_vec, register_histogram, register_histogram_vec, register_int_counter, + register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec, + Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, + UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use pageserver_api::models::state; use utils::id::{TenantId, TimelineId}; /// Prometheus histogram buckets (in seconds) that capture the majority of @@ -35,11 +37,29 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[ "gc", ]; -pub static STORAGE_TIME: Lazy = Lazy::new(|| { - register_histogram_vec!( - "pageserver_storage_operations_seconds", - "Time spent on storage operations", +pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { + register_counter_vec!( + "pageserver_storage_operations_seconds_sum", + "Total time spent on storage operations with operation, tenant and timeline dimensions", &["operation", "tenant_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_storage_operations_seconds_count", + "Count of storage operations with operation, tenant and timeline dimensions", + &["operation", "tenant_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_storage_operations_seconds_global", + "Time spent on storage operations", + &["operation"], get_buckets_for_critical_operations(), ) .expect("failed to define a metric") @@ -112,6 +132,24 @@ static CURRENT_LOGICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define current logical size metric") }); +// Metrics collected on tenant states. +const TENANT_STATE_OPTIONS: &[&str] = &[ + state::LOADING, + state::ATTACHING, + state::ACTIVE, + state::STOPPING, + state::BROKEN, +]; + +pub static TENANT_STATE_METRIC: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_tenant_states_count", + "Count of tenants per state", + &["tenant_id", "state"] + ) + .expect("Failed to register pageserver_tenant_states_count metric") +}); + // Metrics for cloud upload. These metrics reflect data uploaded to cloud storage, // or in testing they estimate how much we would upload if we did. static NUM_PERSISTENT_FILES_CREATED: Lazy = Lazy::new(|| { @@ -375,18 +413,81 @@ pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { .unwrap() }); +/// Similar to [`prometheus::HistogramTimer`] but does not record on drop. +pub struct StorageTimeMetricsTimer { + metrics: StorageTimeMetrics, + start: Instant, +} + +impl StorageTimeMetricsTimer { + fn new(metrics: StorageTimeMetrics) -> Self { + Self { + metrics, + start: Instant::now(), + } + } + + /// Record the time from creation to now. + pub fn stop_and_record(self) { + let duration = self.start.elapsed().as_secs_f64(); + self.metrics.timeline_sum.inc_by(duration); + self.metrics.timeline_count.inc(); + self.metrics.global_histogram.observe(duration); + } +} + +/// Timing facilities for an globally histogrammed metric, which is supported by per tenant and +/// timeline total sum and count. 
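Roughly, the intended pairing of the struct defined next with the timer above looks like this; a sketch only, where the operation name and label values are placeholders and the real wiring lives in TimelineMetrics further down:

    fn time_compaction(tenant_id: &str, timeline_id: &str) {
        // Build the per-operation metrics once for a given tenant/timeline pair.
        let metrics = StorageTimeMetrics::new("compact", tenant_id, timeline_id);
        // Unlike prometheus::HistogramTimer, nothing is recorded on drop; the sum,
        // count and global histogram are only updated by stop_and_record().
        let timer = metrics.start_timer();
        // ... perform the compaction work ...
        timer.stop_and_record();
    }
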
+#[derive(Clone, Debug)] +pub struct StorageTimeMetrics { + /// Sum of f64 seconds, per operation, tenant_id and timeline_id + timeline_sum: Counter, + /// Number of oeprations, per operation, tenant_id and timeline_id + timeline_count: IntCounter, + /// Global histogram having only the "operation" label. + global_histogram: Histogram, +} + +impl StorageTimeMetrics { + pub fn new(operation: &str, tenant_id: &str, timeline_id: &str) -> Self { + let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .unwrap(); + let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE + .get_metric_with_label_values(&[operation, tenant_id, timeline_id]) + .unwrap(); + let global_histogram = STORAGE_TIME_GLOBAL + .get_metric_with_label_values(&[operation]) + .unwrap(); + + StorageTimeMetrics { + timeline_sum, + timeline_count, + global_histogram, + } + } + + /// Starts timing a new operation. + /// + /// Note: unlike [`prometheus::HistogramTimer`] the returned timer does not record on drop. + pub fn start_timer(&self) -> StorageTimeMetricsTimer { + StorageTimeMetricsTimer::new(self.clone()) + } +} + #[derive(Debug)] pub struct TimelineMetrics { tenant_id: String, timeline_id: String, pub reconstruct_time_histo: Histogram, pub materialized_page_cache_hit_counter: GenericCounter, - pub flush_time_histo: Histogram, - pub compact_time_histo: Histogram, - pub create_images_time_histo: Histogram, - pub init_logical_size_histo: Histogram, - pub logical_size_histo: Histogram, - pub load_layer_map_histo: Histogram, + pub flush_time_histo: StorageTimeMetrics, + pub compact_time_histo: StorageTimeMetrics, + pub create_images_time_histo: StorageTimeMetrics, + pub init_logical_size_histo: StorageTimeMetrics, + pub logical_size_histo: StorageTimeMetrics, + pub load_layer_map_histo: StorageTimeMetrics, + pub garbage_collect_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, pub wait_lsn_time_histo: Histogram, pub resident_physical_size_gauge: UIntGauge, @@ -406,24 +507,16 @@ impl TimelineMetrics { let materialized_page_cache_hit_counter = MATERIALIZED_PAGE_CACHE_HIT .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); - let flush_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["layer flush", &tenant_id, &timeline_id]) - .unwrap(); - let compact_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["compact", &tenant_id, &timeline_id]) - .unwrap(); - let create_images_time_histo = STORAGE_TIME - .get_metric_with_label_values(&["create images", &tenant_id, &timeline_id]) - .unwrap(); - let init_logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["init logical size", &tenant_id, &timeline_id]) - .unwrap(); - let logical_size_histo = STORAGE_TIME - .get_metric_with_label_values(&["logical size", &tenant_id, &timeline_id]) - .unwrap(); - let load_layer_map_histo = STORAGE_TIME - .get_metric_with_label_values(&["load layer map", &tenant_id, &timeline_id]) - .unwrap(); + let flush_time_histo = StorageTimeMetrics::new("layer flush", &tenant_id, &timeline_id); + let compact_time_histo = StorageTimeMetrics::new("compact", &tenant_id, &timeline_id); + let create_images_time_histo = + StorageTimeMetrics::new("create images", &tenant_id, &timeline_id); + let init_logical_size_histo = + StorageTimeMetrics::new("init logical size", &tenant_id, &timeline_id); + let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id); + let load_layer_map_histo = + StorageTimeMetrics::new("load layer 
map", &tenant_id, &timeline_id); + let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id); let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &timeline_id]) .unwrap(); @@ -453,6 +546,7 @@ impl TimelineMetrics { create_images_time_histo, init_logical_size_histo, logical_size_histo, + garbage_collect_histo, load_layer_map_histo, last_record_gauge, wait_lsn_time_histo, @@ -478,7 +572,10 @@ impl Drop for TimelineMetrics { let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]); for op in STORAGE_TIME_OPERATIONS { - let _ = STORAGE_TIME.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = + STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); + let _ = + STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]); } for op in STORAGE_IO_TIME_OPERATIONS { let _ = STORAGE_IO_TIME.remove_label_values(&[op, tenant_id, timeline_id]); @@ -495,7 +592,10 @@ impl Drop for TimelineMetrics { } pub fn remove_tenant_metrics(tenant_id: &TenantId) { - let _ = STORAGE_TIME.remove_label_values(&["gc", &tenant_id.to_string(), "-"]); + let tid = tenant_id.to_string(); + for state in TENANT_STATE_OPTIONS { + let _ = TENANT_STATE_METRIC.remove_label_values(&[&tid, state]); + } } use futures::Future; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 344a8d1c00..878928ae06 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -13,6 +13,7 @@ use anyhow::Context; use bytes::Buf; use bytes::Bytes; use futures::{Stream, StreamExt}; +use pageserver_api::models::TenantState; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse, @@ -30,19 +31,19 @@ use std::sync::Arc; use std::time::Duration; use tracing::*; use utils::id::ConnectionId; -use utils::postgres_backend_async::QueryError; use utils::{ auth::{Claims, JwtAuth, Scope}, id::{TenantId, TimelineId}, lsn::Lsn, postgres_backend::AuthType, - postgres_backend_async::{self, PostgresBackend}, + postgres_backend_async::{self, is_expected_io_error, PostgresBackend, QueryError}, simple_rcu::RcuReadGuard, }; use crate::auth::check_permission; use crate::basebackup; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics::{LIVE_CONNECTIONS_COUNT, SMGR_QUERY_TIME}; use crate::task_mgr; @@ -123,6 +124,7 @@ pub async fn libpq_listener_main( auth: Option>, listener: TcpListener, auth_type: AuthType, + listener_ctx: RequestContext, ) -> anyhow::Result<()> { listener.set_nonblocking(true)?; let tokio_listener = tokio::net::TcpListener::from_std(listener)?; @@ -146,6 +148,9 @@ pub async fn libpq_listener_main( debug!("accepted connection from {}", peer_addr); let local_auth = auth.clone(); + let connection_ctx = listener_ctx + .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download); + // PageRequestHandler tasks are not associated with any particular // timeline in the task manager. 
In practice most connections will // only deal with a particular timeline, but we don't know which one @@ -157,7 +162,7 @@ pub async fn libpq_listener_main( None, "serving compute connection task", false, - page_service_conn_main(conf, local_auth, socket, auth_type), + page_service_conn_main(conf, local_auth, socket, auth_type, connection_ctx), ); } Err(err) => { @@ -177,6 +182,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, + connection_ctx: RequestContext, ) -> anyhow::Result<()> { // Immediately increment the gauge, then create a job to decrement it on task exit. // One of the pros of `defer!` is that this will *most probably* @@ -191,24 +197,24 @@ async fn page_service_conn_main( .set_nodelay(true) .context("could not set TCP_NODELAY")?; - let mut conn_handler = PageServerHandler::new(conf, auth); + // XXX: pgbackend.run() should take the connection_ctx, + // and create a child per-query context when it invokes process_query. + // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler + // and create the per-query context in process_query ourselves. + let mut conn_handler = PageServerHandler::new(conf, auth, connection_ctx); let pgbackend = PostgresBackend::new(socket, auth_type, None)?; - let result = pgbackend + match pgbackend .run(&mut conn_handler, task_mgr::shutdown_watcher) - .await; - match result { + .await + { Ok(()) => { // we've been requested to shut down Ok(()) } Err(QueryError::Disconnected(ConnectionError::Socket(io_error))) => { - // `ConnectionReset` error happens when the Postgres client closes the connection. - // As this disconnection happens quite often and is expected, - // we decided to downgrade the logging level to `INFO`. - // See: https://github.com/neondatabase/neon/issues/1683. - if io_error.kind() == io::ErrorKind::ConnectionReset { - info!("Postgres client disconnected"); + if is_expected_io_error(&io_error) { + info!("Postgres client disconnected ({io_error})"); Ok(()) } else { Err(io_error).context("Postgres connection error") @@ -255,30 +261,42 @@ struct PageServerHandler { _conf: &'static PageServerConf, auth: Option>, claims: Option, + + /// The context created for the lifetime of the connection + /// services by this PageServerHandler. + /// For each query received over the connection, + /// `process_query` creates a child context from this one. 
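That gives two levels of derived contexts: a detached child per connection, since a connection may outlive the listener, and an attached child per query, scoped to the connection held in the field declared just below. A compact sketch using the RequestContext API from this patch; the wrapper function itself is illustrative:

    fn derive_contexts(listener_ctx: &RequestContext, connection_ctx: &RequestContext) {
        // Connections may outlive the listener task, so each one gets a detached child.
        let per_connection =
            listener_ctx.detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download);
        // Queries are scoped to their connection, so each one gets an attached child.
        let per_query = connection_ctx.attached_child();
        let _ = (per_connection, per_query);
    }
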
+ connection_ctx: RequestContext, } impl PageServerHandler { - pub fn new(conf: &'static PageServerConf, auth: Option>) -> Self { + pub fn new( + conf: &'static PageServerConf, + auth: Option>, + connection_ctx: RequestContext, + ) -> Self { PageServerHandler { _conf: conf, auth, claims: None, + connection_ctx, } } - #[instrument(skip(self, pgb))] + #[instrument(skip(self, pgb, ctx))] async fn handle_pagerequests( &self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, + ctx: RequestContext, ) -> anyhow::Result<()> { // NOTE: pagerequests handler exits when connection is closed, // so there is no need to reset the association task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Make request tracer if needed - let tenant = get_active_tenant_with_timeout(tenant_id).await?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; let mut tracer = if tenant.get_trace_read_requests() { let connection_id = ConnectionId::generate(); let path = tenant @@ -329,22 +347,27 @@ impl PageServerHandler { let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + // TODO: We could create a new per-request context here, with unique ID. + // Currently we use the same per-timeline context for all requests + let response = match neon_fe_msg { PagestreamFeMessage::Exists(req) => { let _timer = metrics.get_rel_exists.start_timer(); - self.handle_get_rel_exists_request(&timeline, &req).await + self.handle_get_rel_exists_request(&timeline, &req, &ctx) + .await } PagestreamFeMessage::Nblocks(req) => { let _timer = metrics.get_rel_size.start_timer(); - self.handle_get_nblocks_request(&timeline, &req).await + self.handle_get_nblocks_request(&timeline, &req, &ctx).await } PagestreamFeMessage::GetPage(req) => { let _timer = metrics.get_page_at_lsn.start_timer(); - self.handle_get_page_at_lsn_request(&timeline, &req).await + self.handle_get_page_at_lsn_request(&timeline, &req, &ctx) + .await } PagestreamFeMessage::DbSize(req) => { let _timer = metrics.get_db_size.start_timer(); - self.handle_db_size_request(&timeline, &req).await + self.handle_db_size_request(&timeline, &req, &ctx).await } }; @@ -363,7 +386,8 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb))] + #[allow(clippy::too_many_arguments)] + #[instrument(skip(self, pgb, ctx))] async fn handle_import_basebackup( &self, pgb: &mut PostgresBackend, @@ -372,12 +396,13 @@ impl PageServerHandler { base_lsn: Lsn, _end_lsn: Lsn, pg_version: u32, + ctx: RequestContext, ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); // Create empty timeline info!("creating new timeline"); - let tenant = get_active_tenant_with_timeout(tenant_id).await?; - let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version)?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; + let timeline = tenant.create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)?; // TODO mark timeline as not ready until it reaches end_lsn. 
// We might have some wal to import as well, and we should prevent compute @@ -396,7 +421,7 @@ impl PageServerHandler { let mut copyin_stream = Box::pin(copyin_stream(pgb)); timeline - .import_basebackup_from_tar(&mut copyin_stream, base_lsn) + .import_basebackup_from_tar(&mut copyin_stream, base_lsn, &ctx) .await?; // Drain the rest of the Copy data @@ -418,7 +443,7 @@ impl PageServerHandler { Ok(()) } - #[instrument(skip(self, pgb))] + #[instrument(skip(self, pgb, ctx))] async fn handle_import_wal( &self, pgb: &mut PostgresBackend, @@ -426,10 +451,11 @@ impl PageServerHandler { timeline_id: TimelineId, start_lsn: Lsn, end_lsn: Lsn, + ctx: RequestContext, ) -> Result<(), QueryError> { task_mgr::associate_with(Some(tenant_id), Some(timeline_id)); - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let last_record_lsn = timeline.get_last_record_lsn(); if last_record_lsn != start_lsn { return Err(QueryError::Other( @@ -446,7 +472,7 @@ impl PageServerHandler { pgb.flush().await?; let mut copyin_stream = Box::pin(copyin_stream(pgb)); let mut reader = tokio_util::io::StreamReader::new(&mut copyin_stream); - import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn).await?; + import_wal_from_tar(&timeline, &mut reader, start_lsn, end_lsn, &ctx).await?; info!("wal import complete"); // Drain the rest of the Copy data @@ -492,6 +518,7 @@ impl PageServerHandler { mut lsn: Lsn, latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, + ctx: &RequestContext, ) -> anyhow::Result { if latest { // Latest page version was requested. If LSN is given, it is a hint @@ -515,7 +542,7 @@ impl PageServerHandler { if lsn <= last_record_lsn { lsn = last_record_lsn; } else { - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, ctx).await?; // Since we waited for 'lsn' to arrive, that is now the last // record LSN. 
(Or close enough for our purposes; the // last-record LSN can advance immediately after we return @@ -525,7 +552,7 @@ impl PageServerHandler { if lsn == Lsn(0) { anyhow::bail!("invalid LSN(0) in request"); } - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, ctx).await?; } anyhow::ensure!( lsn >= **latest_gc_cutoff_lsn, @@ -535,52 +562,60 @@ impl PageServerHandler { Ok(lsn) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_rel_exists_request( &self, timeline: &Timeline, req: &PagestreamExistsRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; - let exists = timeline.get_rel_exists(req.rel, lsn, req.latest).await?; + let exists = timeline + .get_rel_exists(req.rel, lsn, req.latest, ctx) + .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { exists, })) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))] async fn handle_get_nblocks_request( &self, timeline: &Timeline, req: &PagestreamNblocksRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest).await?; + let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, })) } - #[instrument(skip(self, timeline, req), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))] async fn handle_db_size_request( &self, timeline: &Timeline, req: &PagestreamDbSizeRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest) + .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx) .await?; let db_size = total_blocks as i64 * BLCKSZ as i64; @@ -589,15 +624,17 @@ impl PageServerHandler { })) } - #[instrument(skip(self, timeline, req), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] + #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))] async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, + ctx: &RequestContext, ) -> anyhow::Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn) - .await?; + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, 
&latest_gc_cutoff_lsn, ctx) + .await?; /* // Add a 1s delay to some requests. The delay helps the requests to // hit the race condition from github issue #1047 more easily. @@ -608,7 +645,7 @@ impl PageServerHandler { */ let page = timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest) + .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) .await?; Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { @@ -616,23 +653,25 @@ impl PageServerHandler { })) } - #[instrument(skip(self, pgb))] + #[allow(clippy::too_many_arguments)] + #[instrument(skip(self, pgb, ctx))] async fn handle_basebackup_request( - &self, + &mut self, pgb: &mut PostgresBackend, tenant_id: TenantId, timeline_id: TimelineId, lsn: Option, prev_lsn: Option, full_backup: bool, + ctx: RequestContext, ) -> anyhow::Result<()> { // check that the timeline exists - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); if let Some(lsn) = lsn { // Backup was requested at a particular LSN. Wait for it to arrive. info!("waiting for {}", lsn); - timeline.wait_lsn(lsn).await?; + timeline.wait_lsn(lsn, &ctx).await?; timeline .check_lsn_is_in_scope(lsn, &latest_gc_cutoff_lsn) .context("invalid basebackup lsn")?; @@ -645,8 +684,15 @@ impl PageServerHandler { // Send a tarball of the latest layer on the timeline { let mut writer = pgb.copyout_writer(); - basebackup::send_basebackup_tarball(&mut writer, &timeline, lsn, prev_lsn, full_backup) - .await?; + basebackup::send_basebackup_tarball( + &mut writer, + &timeline, + lsn, + prev_lsn, + full_backup, + &ctx, + ) + .await?; } pgb.write_message(&BeMessage::CopyDone)?; @@ -717,6 +763,7 @@ impl postgres_backend_async::Handler for PageServerHandler { pgb: &mut PostgresBackend, query_string: &str, ) -> Result<(), QueryError> { + let ctx = self.connection_ctx.attached_child(); debug!("process query {query_string:?}"); if query_string.starts_with("pagestream ") { @@ -734,7 +781,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; - self.handle_pagerequests(pgb, tenant_id, timeline_id) + self.handle_pagerequests(pgb, tenant_id, timeline_id, ctx) .await?; } else if query_string.starts_with("basebackup ") { let (_, params_raw) = query_string.split_at("basebackup ".len()); @@ -763,7 +810,7 @@ impl postgres_backend_async::Handler for PageServerHandler { }; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false) + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, None, false, ctx) .await?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } @@ -784,7 +831,7 @@ impl postgres_backend_async::Handler for PageServerHandler { .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; self.check_permission(Some(tenant_id))?; - let timeline = get_active_timeline_with_timeout(tenant_id, timeline_id).await?; + let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?; let end_of_timeline = timeline.get_last_record_rlsn(); @@ -835,7 +882,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; // Check that the timeline exists - self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true) + self.handle_basebackup_request(pgb, tenant_id, timeline_id, lsn, prev_lsn, true, ctx) 
.await?; pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?; } else if query_string.starts_with("import basebackup ") { @@ -878,6 +925,7 @@ impl postgres_backend_async::Handler for PageServerHandler { base_lsn, end_lsn, pg_version, + ctx, ) .await { @@ -914,7 +962,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn) + .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) .await { Ok(()) => pgb.write_message(&BeMessage::CommandComplete(b"SELECT 1"))?, @@ -944,7 +992,7 @@ impl postgres_backend_async::Handler for PageServerHandler { self.check_permission(Some(tenant_id))?; - let tenant = get_active_tenant_with_timeout(tenant_id).await?; + let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?; pgb.write_message(&BeMessage::RowDescription(&[ RowDescriptor::int8_col(b"checkpoint_distance"), RowDescriptor::int8_col(b"checkpoint_timeout"), @@ -990,27 +1038,66 @@ impl postgres_backend_async::Handler for PageServerHandler { } } +#[derive(thiserror::Error, Debug)] +enum GetActiveTenantError { + #[error( + "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}" + )] + WaitForActiveTimeout { + latest_state: TenantState, + wait_time: Duration, + }, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for QueryError { + fn from(e: GetActiveTenantError) -> Self { + match e { + GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected( + ConnectionError::Socket(io::Error::new(io::ErrorKind::TimedOut, e.to_string())), + ), + GetActiveTenantError::Other(e) => QueryError::Other(e), + } + } +} + /// Get active tenant. /// /// If the tenant is Loading, waits for it to become Active, for up to 30 s. That /// ensures that queries don't fail immediately after pageserver startup, because /// all tenants are still loading. -async fn get_active_tenant_with_timeout(tenant_id: TenantId) -> anyhow::Result> { +async fn get_active_tenant_with_timeout( + tenant_id: TenantId, + _ctx: &RequestContext, /* require get a context to support cancellation in the future */ +) -> Result, GetActiveTenantError> { let tenant = mgr::get_tenant(tenant_id, false).await?; - match tokio::time::timeout(Duration::from_secs(30), tenant.wait_to_become_active()).await { - Ok(wait_result) => wait_result - // no .context(), the error message is good enough and some tests depend on it - .map(move |()| tenant), - Err(_) => anyhow::bail!("Timeout waiting for tenant {tenant_id} to become Active"), + let wait_time = Duration::from_secs(30); + match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await { + Ok(Ok(())) => Ok(tenant), + // no .context(), the error message is good enough and some tests depend on it + Ok(Err(wait_error)) => Err(GetActiveTenantError::Other(wait_error)), + Err(_) => { + let latest_state = tenant.current_state(); + if latest_state == TenantState::Active { + Ok(tenant) + } else { + Err(GetActiveTenantError::WaitForActiveTimeout { + latest_state, + wait_time, + }) + } + } } } /// Shorthand for getting a reference to a Timeline of an Active tenant. 
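One detail of the helper above is worth spelling out: after the deadline fires it re-reads the latest tenant state, because the tenant may become Active right at the timeout boundary. The same idiom, reduced to a plain tokio watch channel; every name here is illustrative:

    use std::time::Duration;
    use tokio::sync::watch;

    async fn wait_until_true(
        mut rx: watch::Receiver<bool>,
        wait_time: Duration,
    ) -> Result<(), String> {
        let wait = async {
            while !*rx.borrow() {
                rx.changed().await.map_err(|_| "sender dropped".to_string())?;
            }
            Ok::<(), String>(())
        };
        let result = tokio::time::timeout(wait_time, wait).await;
        match result {
            Ok(inner) => inner,
            // Deadline passed: look at the latest value once more before failing,
            // mirroring the current_state() re-check above.
            Err(_elapsed) => {
                if *rx.borrow() {
                    Ok(())
                } else {
                    Err(format!("still not ready after {wait_time:?}"))
                }
            }
        }
    }
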
-async fn get_active_timeline_with_timeout( +async fn get_active_tenant_timeline( tenant_id: TenantId, timeline_id: TimelineId, -) -> anyhow::Result> { - get_active_tenant_with_timeout(tenant_id) - .await - .and_then(|tenant| tenant.get_timeline(timeline_id, true)) + ctx: &RequestContext, +) -> Result, GetActiveTenantError> { + let tenant = get_active_tenant_with_timeout(tenant_id, ctx).await?; + let timeline = tenant.get_timeline(timeline_id, true)?; + Ok(timeline) } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index cc521c5e35..6f9035305d 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,6 +7,7 @@ //! Clarify that) //! use super::tenant::{PageReconstructError, Timeline}; +use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; @@ -97,6 +98,7 @@ impl Timeline { blknum: BlockNumber, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -104,7 +106,7 @@ impl Timeline { ))); } - let nblocks = self.get_rel_size(tag, lsn, latest).await?; + let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", @@ -114,7 +116,7 @@ impl Timeline { } let key = rel_block_to_key(tag, blknum); - self.get(key, lsn).await + self.get(key, lsn, ctx).await } // Get size of a database in blocks @@ -124,13 +126,14 @@ impl Timeline { dbnode: Oid, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn).await?; + let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest).await?; + let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) @@ -142,6 +145,7 @@ impl Timeline { tag: RelTag, lsn: Lsn, latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -154,7 +158,7 @@ impl Timeline { } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest).await? + && !self.get_rel_exists(tag, lsn, latest, ctx).await? 
{ // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -164,7 +168,7 @@ impl Timeline { } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn).await?; + let mut buf = self.get(key, lsn, ctx).await?; let nblocks = buf.get_u32_le(); if latest { @@ -186,6 +190,7 @@ impl Timeline { tag: RelTag, lsn: Lsn, _latest: bool, + ctx: &RequestContext, ) -> Result { if tag.relnode == 0 { return Err(PageReconstructError::Other(anyhow::anyhow!( @@ -199,7 +204,7 @@ impl Timeline { } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -216,10 +221,11 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -244,9 +250,10 @@ impl Timeline { segno: u32, blknum: BlockNumber, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = slru_block_to_key(kind, segno, blknum); - self.get(key, lsn).await + self.get(key, lsn, ctx).await } /// Get size of an SLRU segment @@ -255,9 +262,10 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn).await?; + let mut buf = self.get(key, lsn, ctx).await?; Ok(buf.get_u32_le()) } @@ -267,10 +275,11 @@ impl Timeline { kind: SlruKind, segno: u32, lsn: Lsn, + ctx: &RequestContext, ) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -291,6 +300,7 @@ impl Timeline { pub async fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, + ctx: &RequestContext, ) -> Result { let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn(); let min_lsn = *gc_cutoff_lsn_guard; @@ -313,6 +323,7 @@ impl Timeline { Lsn(mid * 8), &mut found_smaller, &mut found_larger, + ctx, ) .await?; @@ -362,14 +373,18 @@ impl Timeline { probe_lsn: Lsn, found_smaller: &mut bool, found_larger: &mut bool, + ctx: &RequestContext, ) -> Result { - for segno in self.list_slru_segments(SlruKind::Clog, probe_lsn).await? { + for segno in self + .list_slru_segments(SlruKind::Clog, probe_lsn, ctx) + .await? 
+ { let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn) + .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx) .await?; for blknum in (0..nblocks).rev() { let clog_page = self - .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn) + .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx) .await?; if clog_page.len() == BLCKSZ as usize + 8 { @@ -394,11 +409,12 @@ impl Timeline { &self, kind: SlruKind, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.segments), Err(e) => Err(PageReconstructError::from(e)), @@ -410,18 +426,21 @@ impl Timeline { spcnode: Oid, dbnode: Oid, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = relmap_file_key(spcnode, dbnode); - self.get(key, lsn).await + let buf = self.get(key, lsn, ctx).await?; + Ok(buf) } pub async fn list_dbdirs( &self, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry - let buf = self.get(DBDIR_KEY, lsn).await?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await?; match DbDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.dbdirs), @@ -433,18 +452,20 @@ impl Timeline { &self, xid: TransactionId, lsn: Lsn, + ctx: &RequestContext, ) -> Result { let key = twophase_file_key(xid); - let buf = self.get(key, lsn).await?; + let buf = self.get(key, lsn, ctx).await?; Ok(buf) } pub async fn list_twophase_files( &self, lsn: Lsn, + ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry - let buf = self.get(TWOPHASEDIR_KEY, lsn).await?; + let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; match TwoPhaseDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.xids), @@ -452,12 +473,20 @@ impl Timeline { } } - pub async fn get_control_file(&self, lsn: Lsn) -> Result { - self.get(CONTROLFILE_KEY, lsn).await + pub async fn get_control_file( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + self.get(CONTROLFILE_KEY, lsn, ctx).await } - pub async fn get_checkpoint(&self, lsn: Lsn) -> Result { - self.get(CHECKPOINT_KEY, lsn).await + pub async fn get_checkpoint( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { + self.get(CHECKPOINT_KEY, lsn, ctx).await } /// Does the same as get_current_logical_size but counted on demand. @@ -469,15 +498,16 @@ impl Timeline { &self, lsn: Lsn, cancel: CancellationToken, + ctx: &RequestContext, ) -> Result { // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn).await.context("read dbdir")?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?; let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?; let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { for rel in self - .list_rels(*spcnode, *dbnode, lsn) + .list_rels(*spcnode, *dbnode, lsn, ctx) .await .context("list rels")? 
{ @@ -486,9 +516,9 @@ impl Timeline { } let relsize_key = rel_size_to_key(rel); let mut buf = self - .get(relsize_key, lsn) + .get(relsize_key, lsn, ctx) .await - .context("read relation size of {rel:?}")?; + .with_context(|| format!("read relation size of {rel:?}"))?; let relsize = buf.get_u32_le(); total_size += relsize as u64; @@ -501,7 +531,11 @@ impl Timeline { /// Get a KeySpace that covers all the Keys that are in use at the given LSN. /// Anything that's not listed maybe removed from the underlying storage (from /// that LSN forwards). - pub async fn collect_keyspace(&self, lsn: Lsn) -> anyhow::Result { + pub async fn collect_keyspace( + &self, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { // Iterate through key ranges, greedily packing them into partitions let mut result = KeySpaceAccum::new(); @@ -509,7 +543,7 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn).await?; + let buf = self.get(DBDIR_KEY, lsn, ctx).await?; let dbdir = DbDirectory::des(&buf).context("deserialization failure")?; let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); @@ -519,14 +553,14 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn) + .list_rels(spcnode, dbnode, lsn, ctx) .await? .into_iter() .collect(); rels.sort_unstable(); for rel in rels { let relsize_key = rel_size_to_key(rel); - let mut buf = self.get(relsize_key, lsn).await?; + let mut buf = self.get(relsize_key, lsn, ctx).await?; let relsize = buf.get_u32_le(); result.add_range(rel_block_to_key(rel, 0)..rel_block_to_key(rel, relsize)); @@ -542,13 +576,13 @@ impl Timeline { ] { let slrudir_key = slru_dir_to_key(kind); result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn).await?; + let buf = self.get(slrudir_key, lsn, ctx).await?; let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?; let mut segments: Vec = dir.segments.iter().cloned().collect(); segments.sort_unstable(); for segno in segments { let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn).await?; + let mut buf = self.get(segsize_key, lsn, ctx).await?; let segsize = buf.get_u32_le(); result.add_range( @@ -560,7 +594,7 @@ impl Timeline { // Then pg_twophase result.add_key(TWOPHASEDIR_KEY); - let buf = self.get(TWOPHASEDIR_KEY, lsn).await?; + let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?; let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?; let mut xids: Vec = twophase_dir.xids.iter().cloned().collect(); xids.sort_unstable(); @@ -723,9 +757,10 @@ impl<'a> DatadirModification<'a> { spcnode: Oid, dbnode: Oid, img: Bytes, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory (if it doesn't exist already) - let buf = self.get(DBDIR_KEY).await?; + let buf = self.get(DBDIR_KEY, ctx).await?; let mut dbdir = DbDirectory::des(&buf)?; let r = dbdir.dbdirs.insert((spcnode, dbnode), true); @@ -755,9 +790,10 @@ impl<'a> DatadirModification<'a> { &mut self, xid: TransactionId, img: Bytes, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry - let buf = self.get(TWOPHASEDIR_KEY).await?; + let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.insert(xid) { anyhow::bail!("twophase file for xid {} already exists", xid); @@ -781,16 +817,21 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub async 
fn drop_dbdir(&mut self, spcnode: Oid, dbnode: Oid) -> anyhow::Result<()> { + pub async fn drop_dbdir( + &mut self, + spcnode: Oid, + dbnode: Oid, + ctx: &RequestContext, + ) -> anyhow::Result<()> { let req_lsn = self.tline.get_last_record_lsn(); let total_blocks = self .tline - .get_db_size(spcnode, dbnode, req_lsn, true) + .get_db_size(spcnode, dbnode, req_lsn, true, ctx) .await?; // Remove entry from dbdir - let buf = self.get(DBDIR_KEY).await?; + let buf = self.get(DBDIR_KEY, ctx).await?; let mut dir = DbDirectory::des(&buf)?; if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() { let buf = DbDirectory::ser(&dir)?; @@ -817,11 +858,12 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // It's possible that this is the first rel for this db in this // tablespace. Create the reldir entry for it if so. - let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY).await?)?; + let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await?)?; let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() { // Didn't exist. Update dbdir @@ -833,7 +875,7 @@ impl<'a> DatadirModification<'a> { RelDirectory::default() } else { // reldir already exists, fetch it - RelDirectory::des(&self.get(rel_dir_key).await?)? + RelDirectory::des(&self.get(rel_dir_key, ctx).await?)? }; // Add the new relation to the rel directory entry, and write it back @@ -865,13 +907,14 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true).await? { + if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? { let size_key = rel_size_to_key(rel); // Fetch the old size first - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); // Update the entry with the new size. let buf = nblocks.to_le_bytes(); @@ -895,12 +938,13 @@ impl<'a> DatadirModification<'a> { &mut self, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Put size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); // only extend relation here. never decrease the size if nblocks > old_size { @@ -916,12 +960,12 @@ impl<'a> DatadirModification<'a> { } /// Drop a relation. 
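Taken together, the write-path methods above are used as a batch on a single DatadirModification, with the same ctx threaded through every read they perform. A schematic sequence, before the relation-drop path below; the wrapper function and the literal block counts are illustrative:

    async fn create_small_rel(
        timeline: &Timeline,
        rel: RelTag,
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let mut modification = timeline.begin_modification(lsn);
        // Register the relation with an initial size of zero blocks.
        modification.put_rel_creation(rel, 0, ctx).await?;
        // Grow the stored size; put_rel_extend never shrinks a relation.
        modification.put_rel_extend(rel, 8, ctx).await?;
        // The batched key updates are applied when the modification is committed.
        modification.commit()?;
        Ok(())
    }
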
- pub async fn put_rel_drop(&mut self, rel: RelTag) -> anyhow::Result<()> { + pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, "invalid relnode"); // Remove it from the directory entry let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = RelDirectory::des(&buf)?; if dir.rels.remove(&(rel.relnode, rel.forknum)) { @@ -932,7 +976,7 @@ impl<'a> DatadirModification<'a> { // update logical size let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key).await?.get_u32_le(); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); self.pending_nblocks -= old_size as i64; // Remove enty from relation size cache @@ -949,10 +993,11 @@ impl<'a> DatadirModification<'a> { kind: SlruKind, segno: u32, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Add it to the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.insert(segno) { @@ -988,10 +1033,15 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub async fn drop_slru_segment(&mut self, kind: SlruKind, segno: u32) -> anyhow::Result<()> { + pub async fn drop_slru_segment( + &mut self, + kind: SlruKind, + segno: u32, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // Remove it from the directory entry let dir_key = slru_dir_to_key(kind); - let buf = self.get(dir_key).await?; + let buf = self.get(dir_key, ctx).await?; let mut dir = SlruSegmentDirectory::des(&buf)?; if !dir.segments.remove(&segno) { @@ -1015,9 +1065,13 @@ impl<'a> DatadirModification<'a> { } /// This method is used for marking truncated SLRU files - pub async fn drop_twophase_file(&mut self, xid: TransactionId) -> anyhow::Result<()> { + pub async fn drop_twophase_file( + &mut self, + xid: TransactionId, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // Remove it from the directory entry - let buf = self.get(TWOPHASEDIR_KEY).await?; + let buf = self.get(TWOPHASEDIR_KEY, ctx).await?; let mut dir = TwoPhaseDirectory::des(&buf)?; if !dir.xids.remove(&xid) { @@ -1111,7 +1165,7 @@ impl<'a> DatadirModification<'a> { // Internal helper functions to batch the modifications - async fn get(&self, key: Key) -> Result { + async fn get(&self, key: Key, ctx: &RequestContext) -> Result { // Have we already updated the same key? Read the pending updated // version in that case. // @@ -1132,7 +1186,7 @@ impl<'a> DatadirModification<'a> { } } else { let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - self.tline.get(key, lsn).await + self.tline.get(key, lsn, ctx).await } } @@ -1542,10 +1596,11 @@ pub fn create_test_timeline( tenant: &crate::tenant::Tenant, timeline_id: utils::id::TimelineId, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result> { let tline = tenant - .create_empty_timeline(timeline_id, Lsn(8), pg_version)? - .initialize()?; + .create_empty_timeline(timeline_id, Lsn(8), pg_version, ctx)? 
+ .initialize(ctx)?; let mut m = tline.begin_modification(Lsn(8)); m.init_empty()?; m.commit()?; @@ -1598,7 +1653,7 @@ mod tests { assert!(tline.list_rels(0, TESTDB, Lsn(0x30))?.contains(&TESTREL_A)); // Create a branch, check that the relation is visible there - repo.branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Lsn(0x30))?; + repo.branch_timeline(&tline, NEW_TIMELINE_ID, Lsn(0x30))?; let newtline = match repo.get_timeline(NEW_TIMELINE_ID)?.local_timeline() { Some(timeline) => timeline, None => panic!("Should have a local timeline"), diff --git a/pageserver/src/repository.rs b/pageserver/src/repository.rs index 586fd20886..092503b7c5 100644 --- a/pageserver/src/repository.rs +++ b/pageserver/src/repository.rs @@ -37,6 +37,17 @@ impl Key { | self.field6 as i128 } + pub fn from_i128(x: i128) -> Self { + Key { + field1: ((x >> 120) & 0xf) as u8, + field2: ((x >> 104) & 0xFFFF) as u32, + field3: (x >> 72) as u32, + field4: (x >> 40) as u32, + field5: (x >> 32) as u8, + field6: x as u32, + } + } + pub fn next(&self) -> Key { self.add(1) } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 02e2e2ee14..09716ba0e0 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -171,6 +171,9 @@ task_local! { /// #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum TaskKind { + // Pageserver startup, i.e., `main` + Startup, + // libpq listener task. It just accepts connection and spawns a // PageRequestHandler task for each connection. LibpqEndpointListener, @@ -183,13 +186,37 @@ pub enum TaskKind { // associated with one later, after receiving a command from the client. PageRequestHandler, - // Manages the WAL receiver connection for one timeline. It subscribes to - // events from storage_broker, decides which safekeeper to connect to. It spawns a - // separate WalReceiverConnection task to handle each connection. + /// Manages the WAL receiver connection for one timeline. + /// It subscribes to events from storage_broker and decides which safekeeper to connect to. + /// Once the decision has been made, it establishes the connection using the `tokio-postgres` library. + /// There is at most one connection at any given time. + /// + /// That `tokio-postgres` library represents a connection as two objects: a `Client` and a `Connection`. + /// The `Client` object is what library users use to make requests & get responses. + /// Internally, `Client` hands over requests to the `Connection` object. + /// The `Connection` object is responsible for speaking the wire protocol. + /// + /// Walreceiver uses its own abstraction called `TaskHandle` to represent the activity of establishing and handling a connection. + /// That abstraction doesn't use `task_mgr`. + /// The [`WalReceiverManager`] task ensures that this `TaskHandle` task does not outlive the [`WalReceiverManager`] task. + /// For the `RequestContext` that we hand to the TaskHandle, we use the [`WalReceiverConnectionHandler`] task kind. + /// + /// Once the connection is established, the `TaskHandle` task creates a + /// [`WalReceiverConnectionPoller`] task_mgr task that is responsible for polling + /// the `Connection` object. + /// A `CancellationToken` created by the `TaskHandle` task ensures + /// that the [`WalReceiverConnectionPoller`] task will cancel soon after as the `TaskHandle` is dropped. WalReceiverManager, - // Handles a connection to a safekeeper, to stream WAL to a timeline. 
- WalReceiverConnection, + /// The `TaskHandle` task that executes [`walreceiver_connection::handle_walreceiver_connection`]. + /// Not a `task_mgr` task, but we use this `TaskKind` for its `RequestContext`. + /// See the comment on [`WalReceiverManager`]. + WalReceiverConnectionHandler, + + /// The task that polls the `tokio-postgres::Connection` object. + /// Spawned by task [`WalReceiverConnectionHandler`]. + /// See the comment on [`WalReceiverManager`]. + WalReceiverConnectionPoller, // Garbage collection worker. One per tenant GarbageCollector, @@ -200,6 +227,8 @@ pub enum TaskKind { // Initial logical size calculation InitialLogicalSizeCalculation, + OndemandLogicalSizeCalculation, + // Task that flushes frozen in-memory layers to disk LayerFlushTask, @@ -222,6 +251,12 @@ pub enum TaskKind { DownloadAllRemoteLayers, // Task that calculates synthetis size for all active tenants CalculateSyntheticSize, + + // A request that comes in via the pageserver HTTP API. + MgmtRequest, + + #[cfg(test)] + UnitTest, } #[derive(Default)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index c18c645e5b..2f45fe0dfc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -48,9 +48,10 @@ use std::time::{Duration, Instant}; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir; use crate::is_uninit_mark; -use crate::metrics::{remove_tenant_metrics, STORAGE_TIME}; +use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC}; use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; @@ -174,7 +175,7 @@ impl UninitializedTimeline<'_> { /// /// The new timeline is initialized in Active state, and its background jobs are /// started - pub fn initialize(self) -> anyhow::Result> { + pub fn initialize(self, _ctx: &RequestContext) -> anyhow::Result> { let mut timelines = self.owning_tenant.timelines.lock().unwrap(); self.initialize_with_lock(&mut timelines, true, true) } @@ -188,7 +189,7 @@ impl UninitializedTimeline<'_> { mut self, timelines: &mut HashMap>, load_layer_map: bool, - launch_wal_receiver: bool, + activate: bool, ) -> anyhow::Result> { let timeline_id = self.timeline_id; let tenant_id = self.owning_tenant.tenant_id; @@ -221,13 +222,12 @@ impl UninitializedTimeline<'_> { "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}" ) })?; - new_timeline.set_state(TimelineState::Active); v.insert(Arc::clone(&new_timeline)); new_timeline.maybe_spawn_flush_loop(); - if launch_wal_receiver { - new_timeline.launch_wal_receiver(); + if activate { + new_timeline.activate(); } } } @@ -240,11 +240,12 @@ impl UninitializedTimeline<'_> { self, copyin_stream: &mut (impl Stream> + Sync + Send + Unpin), base_lsn: Lsn, + ctx: &RequestContext, ) -> anyhow::Result> { let raw_timeline = self.raw_timeline()?; let mut reader = tokio_util::io::StreamReader::new(copyin_stream); - import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn) + import_datadir::import_basebackup_from_tar(raw_timeline, &mut reader, base_lsn, ctx) .await .context("Failed to import basebackup")?; @@ -262,9 +263,7 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - let timeline = self.initialize()?; - - Ok(timeline) + self.initialize(ctx) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -450,6 +449,7 @@ impl Tenant { /// /// If the operation 
fails, the timeline is left in the tenant's hash map in Broken state. On success, /// it is marked as Active. + #[allow(clippy::too_many_arguments)] async fn timeline_init_and_sync( &self, timeline_id: TimelineId, @@ -458,6 +458,7 @@ impl Tenant { local_metadata: Option, ancestor: Option>, first_save: bool, + _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_id; @@ -573,6 +574,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, + ctx: &RequestContext, ) -> Arc { // XXX: Attach should provide the config, especially during tenant migration. // See https://github.com/neondatabase/neon/issues/1555 @@ -591,6 +593,7 @@ impl Tenant { // Do all the hard work in the background let tenant_clone = Arc::clone(&tenant); + let ctx = ctx.detached_child(TaskKind::Attach, DownloadBehavior::Warn); task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::Attach, @@ -599,7 +602,7 @@ impl Tenant { "attach tenant", false, async move { - match tenant_clone.attach().await { + match tenant_clone.attach(ctx).await { Ok(_) => {} Err(e) => { tenant_clone.set_broken(&e.to_string()); @@ -615,8 +618,8 @@ impl Tenant { /// /// Background task that downloads all data for a tenant and brings it to Active state. /// - #[instrument(skip(self), fields(tenant_id=%self.tenant_id))] - async fn attach(self: &Arc) -> anyhow::Result<()> { + #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + async fn attach(self: &Arc, ctx: RequestContext) -> anyhow::Result<()> { // Create directory with marker file to indicate attaching state. // The load_local_tenants() function in tenant::mgr relies on the marker file // to determine whether a tenant has finished attaching. @@ -716,6 +719,7 @@ impl Tenant { index_parts.remove(&timeline_id).unwrap(), remote_metadata, remote_clients.remove(&timeline_id).unwrap(), + &ctx, ) .await .with_context(|| { @@ -765,6 +769,7 @@ impl Tenant { index_part: IndexPart, remote_metadata: TimelineMetadata, remote_client: RemoteTimelineClient, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!("downloading index file for timeline {}", timeline_id); tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id)) @@ -799,6 +804,7 @@ impl Tenant { local_metadata, ancestor, true, + ctx, ) .await } @@ -827,11 +833,12 @@ impl Tenant { /// If the loading fails for some reason, the Tenant will go into Broken /// state. /// - #[instrument(skip(conf, remote_storage), fields(tenant_id=%tenant_id))] + #[instrument(skip(conf, remote_storage, ctx), fields(tenant_id=%tenant_id))] pub fn spawn_load( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: Option, + ctx: &RequestContext, ) -> Arc { let tenant_conf = match Self::load_tenant_config(conf, tenant_id) { Ok(conf) => conf, @@ -855,6 +862,7 @@ impl Tenant { // Do all the hard work in a background task let tenant_clone = Arc::clone(&tenant); + let ctx = ctx.detached_child(TaskKind::InitialLoad, DownloadBehavior::Warn); let _ = task_mgr::spawn( &tokio::runtime::Handle::current(), TaskKind::InitialLoad, @@ -863,7 +871,7 @@ impl Tenant { "initial tenant load", false, async move { - match tenant_clone.load().await { + match tenant_clone.load(&ctx).await { Ok(()) => {} Err(err) => { tenant_clone.set_broken(&err.to_string()); @@ -884,8 +892,8 @@ impl Tenant { /// Background task to load in-memory data structures for this tenant, from /// files on disk. Used at pageserver startup. 
/// - #[instrument(skip(self), fields(tenant_id=%self.tenant_id))] - async fn load(self: &Arc) -> anyhow::Result<()> { + #[instrument(skip(self, ctx), fields(tenant_id=%self.tenant_id))] + async fn load(self: &Arc, ctx: &RequestContext) -> anyhow::Result<()> { info!("loading tenant task"); utils::failpoint_sleep_millis_async!("before-loading-tenant"); @@ -996,7 +1004,7 @@ impl Tenant { // 1. "Timeline has no ancestor and no layer files" for (timeline_id, local_metadata) in sorted_timelines { - self.load_local_timeline(timeline_id, local_metadata) + self.load_local_timeline(timeline_id, local_metadata, ctx) .await .with_context(|| format!("load local timeline {timeline_id}"))?; } @@ -1013,11 +1021,12 @@ impl Tenant { /// Subroutine of `load_tenant`, to load an individual timeline /// /// NB: The parent is assumed to be already loaded! - #[instrument(skip(self, local_metadata), fields(timeline_id=%timeline_id))] + #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))] async fn load_local_timeline( &self, timeline_id: TimelineId, local_metadata: TimelineMetadata, + ctx: &RequestContext, ) -> anyhow::Result<()> { let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) @@ -1061,6 +1070,7 @@ impl Tenant { Some(local_metadata), ancestor, false, + ctx, ) .await } @@ -1112,6 +1122,7 @@ impl Tenant { new_timeline_id: TimelineId, initdb_lsn: Lsn, pg_version: u32, + _ctx: &RequestContext, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), @@ -1153,6 +1164,7 @@ impl Tenant { ancestor_timeline_id: Option, mut ancestor_start_lsn: Option, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result>> { anyhow::ensure!( self.is_active(), @@ -1190,13 +1202,16 @@ impl Tenant { // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(*lsn).await?; + ancestor_timeline.wait_lsn(*lsn, ctx).await?; } - self.branch_timeline(ancestor_timeline_id, new_timeline_id, ancestor_start_lsn) + self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) + .await? + } + None => { + self.bootstrap_timeline(new_timeline_id, pg_version, ctx) .await? } - None => self.bootstrap_timeline(new_timeline_id, pg_version).await?, }; Ok(Some(loaded_timeline)) @@ -1220,30 +1235,25 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result { anyhow::ensure!( self.is_active(), "Cannot run GC iteration on inactive tenant" ); - let timeline_str = target_timeline_id - .map(|x| x.to_string()) - .unwrap_or_else(|| "-".to_string()); + let gc_result = self + .gc_iteration_internal(target_timeline_id, horizon, pitr, ctx) + .await; - { - let _timer = STORAGE_TIME - .with_label_values(&["gc", &self.tenant_id.to_string(), &timeline_str]) - .start_timer(); - self.gc_iteration_internal(target_timeline_id, horizon, pitr) - .await - } + gc_result } /// Perform one compaction iteration. /// This function is periodically called by compactor task. /// Also it can be explicitly requested per timeline through page server /// api's 'compact' command. 
- pub async fn compaction_iteration(&self) -> anyhow::Result<()> { + pub async fn compaction_iteration(&self, ctx: &RequestContext) -> anyhow::Result<()> { anyhow::ensure!( self.is_active(), "Cannot run compaction iteration on inactive tenant" @@ -1265,7 +1275,7 @@ impl Tenant { for (timeline_id, timeline) in &timelines_to_compact { timeline - .compact() + .compact(ctx) .instrument(info_span!("compact_timeline", timeline = %timeline_id)) .await?; } @@ -1298,7 +1308,11 @@ impl Tenant { } /// Removes timeline-related in-memory data - pub async fn delete_timeline(&self, timeline_id: TimelineId) -> anyhow::Result<()> { + pub async fn delete_timeline( + &self, + timeline_id: TimelineId, + _ctx: &RequestContext, + ) -> anyhow::Result<()> { // Transition the timeline into TimelineState::Stopping. // This should prevent new operations from starting. let timeline = { @@ -1462,8 +1476,7 @@ impl Tenant { tasks::start_background_loops(self.tenant_id); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Active); - timeline.launch_wal_receiver(); + timeline.activate(); } } } @@ -1487,7 +1500,7 @@ impl Tenant { .values() .filter(|timeline| timeline.current_state() != TimelineState::Broken); for timeline in not_broken_timelines { - timeline.set_state(TimelineState::Suspended); + timeline.set_state(TimelineState::Stopping); } } TenantState::Broken => { @@ -1717,7 +1730,33 @@ impl Tenant { tenant_id: TenantId, remote_storage: Option, ) -> Tenant { - let (state, _) = watch::channel(state); + let (state, mut rx) = watch::channel(state); + + tokio::spawn(async move { + let current_state = *rx.borrow_and_update(); + let tid = tenant_id.to_string(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, current_state.as_str()]) + .inc(); + loop { + match rx.changed().await { + Ok(()) => { + let new_state = *rx.borrow(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, current_state.as_str()]) + .dec(); + TENANT_STATE_METRIC + .with_label_values(&[&tid, new_state.as_str()]) + .inc(); + } + Err(_sender_dropped_error) => { + info!("Tenant dropped the state updates sender, quitting waiting for tenant state change"); + return; + } + } + } + }); + Tenant { tenant_id, conf, @@ -1776,69 +1815,70 @@ impl Tenant { } pub(super) fn persist_tenant_config( + tenant_id: &TenantId, target_config_path: &Path, tenant_conf: TenantConfOpt, - first_save: bool, + creating_tenant: bool, ) -> anyhow::Result<()> { let _enter = info_span!("saving tenantconf").entered(); - info!("persisting tenantconf to {}", target_config_path.display()); - // TODO this will prepend comments endlessly ? - let mut conf_content = r#"# This file contains a specific per-tenant's config. -# It is read in case of pageserver restart. - -[tenant_config] -"# - .to_string(); - - // Convert the config to a toml file. 
- conf_content += &toml_edit::easy::to_string(&tenant_conf)?; - - let mut target_config_file = VirtualFile::open_with_options( - target_config_path, - OpenOptions::new() - .truncate(true) // This needed for overwriting with small config files - .write(true) - .create_new(first_save), - )?; - - target_config_file - .write(conf_content.as_bytes()) - .context("Failed to write toml bytes into file") - .and_then(|_| { - target_config_file - .sync_all() - .context("Faile to fsync config file") - }) - .with_context(|| { + // imitate a try-block with a closure + let do_persist = |target_config_path: &Path| -> anyhow::Result<()> { + let target_config_parent = target_config_path.parent().with_context(|| { format!( - "Failed to write config file into path '{}'", + "Config path does not have a parent: {}", target_config_path.display() ) })?; - // fsync the parent directory to ensure the directory entry is durable - if first_save { - target_config_path - .parent() - .context("Config file does not have a parent") - .and_then(|target_config_parent| { - File::open(target_config_parent).context("Failed to open config parent") - }) - .and_then(|tenant_dir| { - tenant_dir - .sync_all() - .context("Failed to fsync config parent") - }) - .with_context(|| { - format!( - "Failed to fsync on first save for config {}", - target_config_path.display() - ) - })?; - } + info!("persisting tenantconf to {}", target_config_path.display()); - Ok(()) + let mut conf_content = r#"# This file contains a specific per-tenant's config. +# It is read in case of pageserver restart. + +[tenant_config] +"# + .to_string(); + + // Convert the config to a toml file. + conf_content += &toml_edit::easy::to_string(&tenant_conf)?; + + let mut target_config_file = VirtualFile::open_with_options( + target_config_path, + OpenOptions::new() + .truncate(true) // This needed for overwriting with small config files + .write(true) + .create_new(creating_tenant) + // when creating a new tenant, first_save will be true and `.create(true)` will be + // ignored (per rust std docs). + // + // later when updating the config of created tenant, or persisting config for the + // first time for attached tenant, the `.create(true)` is used. + .create(true), + )?; + + target_config_file + .write(conf_content.as_bytes()) + .context("write toml bytes into file") + .and_then(|_| target_config_file.sync_all().context("fsync config file")) + .context("write config file")?; + + // fsync the parent directory to ensure the directory entry is durable. + // before this was done conditionally on creating_tenant, but these management actions are rare + // enough to just fsync it always. + + crashsafe::fsync(target_config_parent)?; + Ok(()) + }; + + // this function is called from creating the tenant and updating the tenant config, which + // would otherwise share this context, so keep it here in one place. 
+ do_persist(target_config_path).with_context(|| { + format!( + "write tenant {tenant_id} config to {}", + target_config_path.display() + ) + }) } // @@ -1871,12 +1911,13 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result { let mut totals: GcResult = Default::default(); let now = Instant::now(); let gc_timelines = self - .refresh_gc_info_internal(target_timeline_id, horizon, pitr) + .refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx) .await?; utils::failpoint_sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); @@ -1917,7 +1958,10 @@ impl Tenant { /// [`Tenant::get_gc_horizon`]. /// /// This is usually executed as part of periodic gc, but can now be triggered more often. - pub async fn refresh_gc_info(&self) -> anyhow::Result>> { + pub async fn refresh_gc_info( + &self, + ctx: &RequestContext, + ) -> anyhow::Result>> { // since this method can now be called at different rates than the configured gc loop, it // might be that these configuration values get applied faster than what it was previously, // since these were only read from the gc task. @@ -1927,7 +1971,7 @@ impl Tenant { // refresh all timelines let target_timeline_id = None; - self.refresh_gc_info_internal(target_timeline_id, horizon, pitr) + self.refresh_gc_info_internal(target_timeline_id, horizon, pitr, ctx) .await } @@ -1936,6 +1980,7 @@ impl Tenant { target_timeline_id: Option, horizon: u64, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result>> { // grab mutex to prevent new timelines from being created here. let gc_cs = self.gc_cs.lock().await; @@ -2007,7 +2052,9 @@ impl Tenant { )) .map(|&x| x.1) .collect(); - timeline.update_gc_info(branchpoints, cutoff, pitr).await?; + timeline + .update_gc_info(branchpoints, cutoff, pitr, ctx) + .await?; gc_timelines.push(timeline); } @@ -2019,53 +2066,53 @@ impl Tenant { /// Branch an existing timeline async fn branch_timeline( &self, - src: TimelineId, - dst: TimelineId, + src_timeline: &Arc, + dst_id: TimelineId, start_lsn: Option, + _ctx: &RequestContext, ) -> anyhow::Result> { - // We need to hold this lock to prevent GC from starting at the same time. GC scans the directory to learn - // about timelines, so otherwise a race condition is possible, where we create new timeline and GC - // concurrently removes data that is needed by the new timeline. - let _gc_cs = self.gc_cs.lock().await; - let timeline_uninit_mark = { - let timelines = self.timelines.lock().unwrap(); - self.create_timeline_uninit_mark(dst, &timelines)? - }; - - // In order for the branch creation task to not wait for GC/compaction, - // we need to make sure that the starting LSN of the child branch is not out of scope midway by - // - // 1. holding the GC lock to prevent overwritting timeline's GC data - // 2. checking both the latest GC cutoff LSN and latest GC info of the source timeline - // - // Step 2 is to avoid initializing the new branch using data removed by past GC iterations - // or in-queue GC iterations. 
- - let src_timeline = self.get_timeline(src, false).with_context(|| { - format!( - "No ancestor {} found for timeline {}/{}", - src, self.tenant_id, dst - ) - })?; - - let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); + let src_id = src_timeline.timeline_id; // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN let start_lsn = start_lsn.unwrap_or_else(|| { let lsn = src_timeline.get_last_record_lsn(); - info!("branching timeline {dst} from timeline {src} at last record LSN: {lsn}"); + info!("branching timeline {dst_id} from timeline {src_id} at last record LSN: {lsn}"); lsn }); - // Check if the starting LSN is out of scope because it is less than - // 1. the latest GC cutoff LSN or - // 2. the planned GC cutoff LSN, which is from an in-queue GC iteration. + // First acquire the GC lock so that another task cannot advance the GC + // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are + // creating the branch. + let _gc_cs = self.gc_cs.lock().await; + + // Create a placeholder for the new branch. This will error + // out if the new timeline ID is already in use. + let timeline_uninit_mark = { + let timelines = self.timelines.lock().unwrap(); + self.create_timeline_uninit_mark(dst_id, &timelines)? + }; + + // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR + // horizon on the source timeline + // + // We check it against both the planned GC cutoff stored in 'gc_info', + // and the 'latest_gc_cutoff' of the last GC that was performed. The + // planned GC cutoff in 'gc_info' is normally larger than + // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just + // changed the GC settings for the tenant to make the PITR window + // larger, but some of the data was already removed by an earlier GC + // iteration. + + // check against last actual 'latest_gc_cutoff' first + let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn(); src_timeline .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn) .context(format!( "invalid branch start lsn: less than latest GC cutoff {}", *latest_gc_cutoff_lsn, ))?; + + // and then the planned GC cutoff { let gc_info = src_timeline.gc_info.read().unwrap(); let cutoff = min(gc_info.pitr_cutoff, gc_info.horizon_cutoff); @@ -2076,6 +2123,12 @@ impl Tenant { } } + // + // The branch point is valid, and we are still holding the 'gc_cs' lock + // so that GC cannot advance the GC cutoff until we are finished. + // Proceed with the branch creation. + // + // Determine prev-LSN for the new timeline. We can only determine it if // the timeline was branched at the current end of the source timeline. let RecordLsn { @@ -2094,7 +2147,7 @@ impl Tenant { let metadata = TimelineMetadata::new( start_lsn, dst_prev, - Some(src), + Some(src_id), start_lsn, *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer? src_timeline.initdb_lsn, @@ -2103,15 +2156,15 @@ impl Tenant { let mut timelines = self.timelines.lock().unwrap(); let new_timeline = self .prepare_timeline( - dst, + dst_id, metadata, timeline_uninit_mark, false, - Some(src_timeline), + Some(Arc::clone(src_timeline)), )? 
.initialize_with_lock(&mut timelines, true, true)?; drop(timelines); - info!("branched timeline {dst} from {src} at {start_lsn}"); + info!("branched timeline {dst_id} from {src_id} at {start_lsn}"); Ok(new_timeline) } @@ -2122,6 +2175,7 @@ impl Tenant { &self, timeline_id: TimelineId, pg_version: u32, + ctx: &RequestContext, ) -> anyhow::Result> { let timeline_uninit_mark = { let timelines = self.timelines.lock().unwrap(); @@ -2181,6 +2235,7 @@ impl Tenant { unfinished_timeline, pgdata_path, pgdata_lsn, + ctx, ) .await .with_context(|| { @@ -2352,7 +2407,10 @@ impl Tenant { /// /// Future is cancellation safe. Only one calculation can be running at once per tenant. #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn gather_size_inputs(&self) -> anyhow::Result { + pub async fn gather_size_inputs( + &self, + ctx: &RequestContext, + ) -> anyhow::Result { let logical_sizes_at_once = self .conf .concurrent_tenant_size_logical_size_queries @@ -2364,15 +2422,15 @@ impl Tenant { // See more for on the issue #2748 condenced out of the initial PR review. let mut shared_cache = self.cached_logical_sizes.lock().await; - size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache).await + size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await } /// Calculate synthetic tenant size /// This is periodically called by background worker. /// result is cached in tenant struct #[instrument(skip_all, fields(tenant_id=%self.tenant_id))] - pub async fn calculate_synthetic_size(&self) -> anyhow::Result { - let inputs = self.gather_size_inputs().await?; + pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result { + let inputs = self.gather_size_inputs(ctx).await?; let size = inputs.calculate()?; @@ -2475,26 +2533,19 @@ fn try_create_target_tenant_dir( target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary timelines dir"))?; + .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?; let temporary_tenant_config_path = rebase_directory( &conf.tenant_config_path(tenant_id), target_tenant_directory, temporary_tenant_dir, ) - .with_context(|| format!("Failed to resolve tenant {tenant_id} temporary config path"))?; + .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?; + + Tenant::persist_tenant_config(&tenant_id, &temporary_tenant_config_path, tenant_conf, true)?; - Tenant::persist_tenant_config(&temporary_tenant_config_path, tenant_conf, true).with_context( - || { - format!( - "Failed to write tenant {} config to {}", - tenant_id, - temporary_tenant_config_path.display() - ) - }, - )?; crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| { format!( - "could not create tenant {} temporary timelines directory {}", + "create tenant {} temporary timelines directory {}", tenant_id, temporary_tenant_timelines_dir.display() ) @@ -2505,7 +2556,7 @@ fn try_create_target_tenant_dir( fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| { format!( - "failed to move tenant {} temporary directory {} into the permanent one {}", + "move tenant {} temporary directory {} into the permanent one {}", tenant_id, temporary_tenant_dir.display(), target_tenant_directory.display() @@ -2513,14 +2564,14 @@ fn try_create_target_tenant_dir( })?; let target_dir_parent = target_tenant_directory.parent().with_context(|| { format!( - "Failed to get tenant {} dir parent for {}", + "get tenant {} dir parent for {}", 
tenant_id, target_tenant_directory.display() ) })?; crashsafe::fsync(target_dir_parent).with_context(|| { format!( - "Failed to fsync renamed directory's parent {} for tenant {}", + "fsync renamed directory's parent {} for tenant {}", target_dir_parent.display(), tenant_id, ) @@ -2743,11 +2794,17 @@ pub mod harness { }) } - pub async fn load(&self) -> Arc { - self.try_load().await.expect("failed to load test tenant") + pub async fn load(&self) -> (Arc, RequestContext) { + let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); + ( + self.try_load(&ctx) + .await + .expect("failed to load test tenant"), + ctx, + ) } - pub async fn try_load(&self) -> anyhow::Result> { + pub async fn try_load(&self, ctx: &RequestContext) -> anyhow::Result> { let walredo_mgr = Arc::new(TestRedoManager); let tenant = Arc::new(Tenant::new( @@ -2775,8 +2832,7 @@ pub mod harness { timelines_to_load.insert(timeline_id, timeline_metadata); } // FIXME starts background jobs - tenant.load().await?; - + tenant.load(ctx).await?; Ok(tenant) } @@ -2833,10 +2889,9 @@ mod tests { #[tokio::test] async fn test_basic() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_basic")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_basic")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -2849,15 +2904,15 @@ mod tests { drop(writer); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x10)).await?, + tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x1f)).await?, + tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x20)).await?, + tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, TEST_IMG("foo at 0x20") ); @@ -2866,14 +2921,14 @@ mod tests { #[tokio::test] async fn no_duplicate_timelines() -> anyhow::Result<()> { - let tenant = TenantHarness::create("no_duplicate_timelines")? + let (tenant, ctx) = TenantHarness::create("no_duplicate_timelines")? .load() .await; - let _ = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let _ = timeline.initialize(&ctx)?; - match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION) { + match tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx) { Ok(_) => panic!("duplicate timeline creation should fail"), Err(e) => assert_eq!( e.to_string(), @@ -2899,13 +2954,13 @@ mod tests { /// #[tokio::test] async fn test_branch() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_branch")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; - let writer = tline.writer(); use std::str::from_utf8; + let (tenant, ctx) = TenantHarness::create("test_branch")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; + let writer = tline.writer(); + #[allow(non_snake_case)] let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap(); #[allow(non_snake_case)] @@ -2925,7 +2980,7 @@ mod tests { // Branch the history, modify relation differently on the new timeline tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x30))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x30)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -2936,15 +2991,15 @@ mod tests { // Check page contents on both branches assert_eq!( - from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40)).await?)?, + from_utf8(&tline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?, "foo at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40)).await?)?, + from_utf8(&newtline.get(TEST_KEY_A, Lsn(0x40), &ctx).await?)?, "bar at 0x40" ); assert_eq!( - from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40)).await?)?, + from_utf8(&newtline.get(TEST_KEY_B, Lsn(0x40), &ctx).await?)?, "foobar at 0x20" ); @@ -2996,13 +3051,12 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> { - let tenant = + let (tenant, ctx) = TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")? .load() .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 @@ -3010,12 +3064,12 @@ mod tests { // and compaction works. But it does set the 'cutoff' point so that the cross check // below should fail. tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; // try to branch at lsn 25, should fail because we already garbage collected the data match tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3034,16 +3088,17 @@ mod tests { #[tokio::test] async fn test_prohibit_branch_creation_on_pre_initdb_lsn() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? - .load() - .await; + let (tenant, ctx) = + TenantHarness::create("test_prohibit_branch_creation_on_pre_initdb_lsn")? + .load() + .await; - tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION)? - .initialize()?; + let tline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0x50), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; // try to branch at lsn 0x25, should fail because initdb lsn is 0x50 match tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x25))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x25)), &ctx) .await { Ok(_) => panic!("branching should have failed"), @@ -3085,40 +3140,40 @@ mod tests { #[tokio::test] async fn test_retain_data_in_parent_which_is_needed_for_child() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? - .load() - .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = + TenantHarness::create("test_retain_data_in_parent_which_is_needed_for_child")? + .load() + .await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) .expect("Should have a local timeline"); // this removes layers before lsn 40 (50 minus 10), so there are two remaining layers, image and delta for 31-50 tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; - assert!(newtline.get(*TEST_KEY, Lsn(0x25)).await.is_ok()); + assert!(newtline.get(*TEST_KEY, Lsn(0x25), &ctx).await.is_ok()); Ok(()) } #[tokio::test] async fn test_parent_keeps_data_forever_after_branching() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_parent_keeps_data_forever_after_branching")? - .load() - .await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = + TenantHarness::create("test_parent_keeps_data_forever_after_branching")? + .load() + .await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant .get_timeline(NEW_TIMELINE_ID, true) @@ -3128,12 +3183,12 @@ mod tests { // run gc on parent tenant - .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO) + .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx) .await?; // Check that the data is still accessible on the branch. assert_eq!( - newtline.get(*TEST_KEY, Lsn(0x50)).await?, + newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?, TEST_IMG(&format!("foo at {}", Lsn(0x40))) ); @@ -3145,14 +3200,14 @@ mod tests { const TEST_NAME: &str = "timeline_load"; let harness = TenantHarness::create(TEST_NAME)?; { - let tenant = harness.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0x8000), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x8000)).await?; } - let tenant = harness.load().await; + let (tenant, _ctx) = harness.load().await; tenant .get_timeline(TIMELINE_ID, true) .expect("cannot load timeline"); @@ -3166,15 +3221,15 @@ mod tests { let harness = TenantHarness::create(TEST_NAME)?; // create two timelines { - let tenant = harness.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = harness.load().await; + let tline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; make_some_layers(tline.as_ref(), Lsn(0x20)).await?; tenant - .branch_timeline(TIMELINE_ID, NEW_TIMELINE_ID, Some(Lsn(0x40))) + .branch_timeline(&tline, NEW_TIMELINE_ID, Some(Lsn(0x40)), &ctx) .await?; let newtline = tenant @@ -3185,7 +3240,7 @@ mod tests { } // check that both of them are initially unloaded - let tenant = harness.load().await; + let (tenant, _ctx) = harness.load().await; // check that both, child and ancestor are loaded let _child_tline = tenant @@ -3203,11 +3258,11 @@ mod tests { async fn corrupt_metadata() -> anyhow::Result<()> { const TEST_NAME: &str = "corrupt_metadata"; let harness = TenantHarness::create(TEST_NAME)?; - let tenant = harness.load().await; + let (tenant, ctx) = harness.load().await; tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? + .initialize(&ctx)?; drop(tenant); let metadata_path = harness.timeline_path(&TIMELINE_ID).join(METADATA_FILE_NAME); @@ -3219,7 +3274,7 @@ mod tests { metadata_bytes[8] ^= 1; std::fs::write(metadata_path, metadata_bytes)?; - let err = harness.try_load().await.err().expect("should fail"); + let err = harness.try_load(&ctx).await.err().expect("should fail"); assert!(err .to_string() .starts_with("Failed to parse metadata bytes from path")); @@ -3243,10 +3298,9 @@ mod tests { #[tokio::test] async fn test_images() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_images")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_images")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x10), &Value::Image(TEST_IMG("foo at 0x10")))?; @@ -3254,7 +3308,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x20), &Value::Image(TEST_IMG("foo at 0x20")))?; @@ -3262,7 +3316,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x30), &Value::Image(TEST_IMG("foo at 0x30")))?; @@ -3270,7 +3324,7 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; let writer = tline.writer(); writer.put(*TEST_KEY, Lsn(0x40), &Value::Image(TEST_IMG("foo at 0x40")))?; @@ -3278,26 +3332,26 @@ mod tests { drop(writer); tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; assert_eq!( - tline.get(*TEST_KEY, Lsn(0x10)).await?, + tline.get(*TEST_KEY, Lsn(0x10), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x1f)).await?, + tline.get(*TEST_KEY, Lsn(0x1f), &ctx).await?, TEST_IMG("foo at 0x10") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x20)).await?, + tline.get(*TEST_KEY, Lsn(0x20), &ctx).await?, TEST_IMG("foo at 0x20") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x30)).await?, + tline.get(*TEST_KEY, Lsn(0x30), &ctx).await?, TEST_IMG("foo at 0x30") ); assert_eq!( - tline.get(*TEST_KEY, Lsn(0x40)).await?, + tline.get(*TEST_KEY, Lsn(0x40), &ctx).await?, TEST_IMG("foo at 0x40") ); @@ -3310,10 +3364,9 @@ mod tests { // #[tokio::test] async fn test_bulk_insert() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_bulk_insert")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_bulk_insert")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; let mut lsn = Lsn(0x10); @@ -3342,10 +3395,10 @@ mod tests { let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3354,10 +3407,9 @@ mod tests { #[tokio::test] async fn test_random_updates() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_random_updates")?.load().await; - let tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? 
- .initialize()?; + let (tenant, ctx) = TenantHarness::create("test_random_updates")?.load().await; + let tline = tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; + let tline = tline.initialize(&ctx)?; const NUM_KEYS: usize = 1000; @@ -3407,7 +3459,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn).await?, + tline.get(test_key, lsn, &ctx).await?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3415,10 +3467,10 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3427,12 +3479,12 @@ mod tests { #[tokio::test] async fn test_traverse_branches() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_branches")? + let (tenant, ctx) = TenantHarness::create("test_traverse_branches")? .load() .await; let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? + .initialize(&ctx)?; const NUM_KEYS: usize = 1000; @@ -3462,16 +3514,14 @@ mod tests { keyspace.add_key(test_key); } - let mut tline_id = TIMELINE_ID; for _ in 0..50 { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); - tline_id = new_tline_id; for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -3493,7 +3543,7 @@ mod tests { for (blknum, last_lsn) in updated.iter().enumerate() { test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, lsn).await?, + tline.get(test_key, lsn, &ctx).await?, TEST_IMG(&format!("{} at {}", blknum, last_lsn)) ); } @@ -3501,10 +3551,10 @@ mod tests { // Perform a cycle of flush, compact, and GC let cutoff = tline.get_last_record_lsn(); tline - .update_gc_info(Vec::new(), cutoff, Duration::ZERO) + .update_gc_info(Vec::new(), cutoff, Duration::ZERO, &ctx) .await?; tline.freeze_and_flush().await?; - tline.compact().await?; + tline.compact(&ctx).await?; tline.gc().await?; } @@ -3513,12 +3563,12 @@ mod tests { #[tokio::test] async fn test_traverse_ancestors() -> anyhow::Result<()> { - let tenant = TenantHarness::create("test_traverse_ancestors")? + let (tenant, ctx) = TenantHarness::create("test_traverse_ancestors")? .load() .await; let mut tline = tenant - .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION)? - .initialize()?; + .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)? 
+ .initialize(&ctx)?; const NUM_KEYS: usize = 100; const NUM_TLINES: usize = 50; @@ -3528,18 +3578,16 @@ mod tests { let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES]; let mut lsn = Lsn(0); - let mut tline_id = TIMELINE_ID; #[allow(clippy::needless_range_loop)] for idx in 0..NUM_TLINES { let new_tline_id = TimelineId::generate(); tenant - .branch_timeline(tline_id, new_tline_id, Some(lsn)) + .branch_timeline(&tline, new_tline_id, Some(lsn), &ctx) .await?; tline = tenant .get_timeline(new_tline_id, true) .expect("Should have the branched timeline"); - tline_id = new_tline_id; for _ in 0..NUM_KEYS { lsn = Lsn(lsn.0 + 0x10); @@ -3568,7 +3616,7 @@ mod tests { println!("checking [{idx}][{blknum}] at {lsn}"); test_key.field6 = blknum as u32; assert_eq!( - tline.get(test_key, *lsn).await?, + tline.get(test_key, *lsn, &ctx).await?, TEST_IMG(&format!("{idx} {blknum} at {lsn}")) ); } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index c95a98fbc7..e66ee0ae36 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -28,7 +28,12 @@ pub mod defaults { pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10; pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024; - pub const DEFAULT_GC_PERIOD: &str = "100 s"; + + // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger. + // If there's a need to decrease this value, first make sure that GC + // doesn't hold a layer map write lock for non-trivial operations. + // Relevant: https://github.com/neondatabase/neon/issues/3394 + pub const DEFAULT_GC_PERIOD: &str = "1 hr"; pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3; pub const DEFAULT_PITR_INTERVAL: &str = "7 days"; pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "2 seconds"; diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 01c5359e88..ed1a32c8fd 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -9,24 +9,57 @@ //! are frozen, and it is split up into new image and delta layers and the //! corresponding files are written to disk. //! +//! Design overview: +//! +//! The `search` method of the layer map is on the read critical path, so we've +//! built an efficient data structure for fast reads, stored in `LayerMap::historic`. +//! Other read methods are less critical but still impact performance of background tasks. +//! +//! This data structure relies on a persistent/immutable binary search tree. See the +//! following lecture for an introduction https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +//! Summary: A persistent/immutable BST (and persistent data structures in general) allows +//! you to modify the tree in such a way that each modification creates a new "version" +//! of the tree. When you modify it, you get a new version, but all previous versions are +//! still accessible too. So if someone is still holding a reference to an older version, +//! they continue to see the tree as it was then. The persistent BST stores all the +//! different versions in an efficient way. +//! +//! Our persistent BST maintains a map of which layer file "covers" each key. It has only +//! one dimension, the key. See `layer_coverage.rs`. We use the persistent/immutable property +//! to handle the LSN dimension. +//! +//! To build the layer map, we insert each layer to the persistent BST in LSN.start order, +//! starting from the oldest one. After each insertion, we grab a reference to that "version" +//! 
of the tree, and store it in another tree, a BtreeMap keyed by the LSN. See +//! `historic_layer_coverage.rs`. +//! +//! To search for a particular key-LSN pair, you first look up the right "version" in the +//! BTreeMap. Then you search that version of the BST with the key. +//! +//! The persistent BST keeps all the versions, but there is no way to change the old versions +//! afterwards. We can add layers as long as they have larger LSNs than any previous layer in +//! the map, but if we need to remove a layer, or insert anything with an older LSN, we need +//! to throw away most of the persistent BST and build a new one, starting from the oldest +//! LSN. See `LayerMap::flush_updates()`. +//! +mod historic_layer_coverage; +mod layer_coverage; + +use crate::keyspace::KeyPartitioning; use crate::metrics::NUM_ONDISK_LAYERS; use crate::repository::Key; -use crate::tenant::storage_layer::{range_eq, range_overlaps}; -use amplify_num::i256; +use crate::tenant::storage_layer::InMemoryLayer; +use crate::tenant::storage_layer::Layer; use anyhow::Result; -use num_traits::identities::{One, Zero}; -use num_traits::{Bounded, Num, Signed}; -use rstar::{RTree, RTreeObject, AABB}; -use std::cmp::Ordering; use std::collections::VecDeque; use std::ops::Range; -use std::ops::{Add, Div, Mul, Neg, Rem, Sub}; use std::sync::Arc; -use tracing::*; use utils::lsn::Lsn; -use super::storage_layer::{InMemoryLayer, Layer}; +use historic_layer_coverage::BufferedHistoricLayerCoverage; + +use super::storage_layer::range_eq; /// /// LayerMap tracks what layers exist on a timeline. @@ -51,8 +84,8 @@ pub struct LayerMap { /// pub frozen_layers: VecDeque>, - /// All the historic layers are kept here - historic_layers: RTree>, + /// Index of the historic layers optimized for search + historic: BufferedHistoricLayerCoverage>, /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient. /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree. @@ -65,177 +98,64 @@ impl Default for LayerMap { open_layer: None, next_open_layer_at: None, frozen_layers: VecDeque::default(), - historic_layers: RTree::default(), l0_delta_layers: Vec::default(), + historic: BufferedHistoricLayerCoverage::default(), } } } -struct LayerRTreeObject { - layer: Arc, - - envelope: AABB<[IntKey; 2]>, +/// The primary update API for the layer map. +/// +/// Batching historic layer insertions and removals is good for +/// performance and this struct helps us do that correctly. +#[must_use] +pub struct BatchedUpdates<'a, L: ?Sized + Layer> { + // While we hold this exclusive reference to the layer map the type checker + // will prevent us from accidentally reading any unflushed updates. + layer_map: &'a mut LayerMap, } -// Representation of Key as numeric type. -// We can not use native implementation of i128, because rstar::RTree -// doesn't handle properly integer overflow during area calculation: sum(Xi*Yi). -// Overflow will cause panic in debug mode and incorrect area calculation in release mode, -// which leads to non-optimally balanced R-Tree (but doesn't fit correctness of R-Tree work). -// By using i256 as the type, even though all the actual values would fit in i128, we can be -// sure that multiplication doesn't overflow. 
-// - -#[derive(Clone, PartialEq, Eq, PartialOrd, Debug)] -struct IntKey(i256); - -impl Copy for IntKey {} - -impl IntKey { - fn from(i: i128) -> Self { - IntKey(i256::from(i)) - } -} - -impl Bounded for IntKey { - fn min_value() -> Self { - IntKey(i256::MIN) - } - fn max_value() -> Self { - IntKey(i256::MAX) - } -} - -impl Signed for IntKey { - fn is_positive(&self) -> bool { - self.0 > i256::ZERO - } - fn is_negative(&self) -> bool { - self.0 < i256::ZERO - } - fn signum(&self) -> Self { - match self.0.cmp(&i256::ZERO) { - Ordering::Greater => IntKey(i256::ONE), - Ordering::Less => IntKey(-i256::ONE), - Ordering::Equal => IntKey(i256::ZERO), - } - } - fn abs(&self) -> Self { - IntKey(self.0.abs()) - } - fn abs_sub(&self, other: &Self) -> Self { - if self.0 <= other.0 { - IntKey(i256::ZERO) - } else { - IntKey(self.0 - other.0) - } - } -} - -impl Neg for IntKey { - type Output = Self; - fn neg(self) -> Self::Output { - IntKey(-self.0) - } -} - -impl Rem for IntKey { - type Output = Self; - fn rem(self, rhs: Self) -> Self::Output { - IntKey(self.0 % rhs.0) - } -} - -impl Div for IntKey { - type Output = Self; - fn div(self, rhs: Self) -> Self::Output { - IntKey(self.0 / rhs.0) - } -} - -impl Add for IntKey { - type Output = Self; - fn add(self, rhs: Self) -> Self::Output { - IntKey(self.0 + rhs.0) - } -} - -impl Sub for IntKey { - type Output = Self; - fn sub(self, rhs: Self) -> Self::Output { - IntKey(self.0 - rhs.0) - } -} - -impl Mul for IntKey { - type Output = Self; - fn mul(self, rhs: Self) -> Self::Output { - IntKey(self.0 * rhs.0) - } -} - -impl One for IntKey { - fn one() -> Self { - IntKey(i256::ONE) - } -} - -impl Zero for IntKey { - fn zero() -> Self { - IntKey(i256::ZERO) - } - fn is_zero(&self) -> bool { - self.0 == i256::ZERO - } -} - -impl Num for IntKey { - type FromStrRadixErr = ::FromStrRadixErr; - fn from_str_radix(str: &str, radix: u32) -> Result { - Ok(IntKey(i256::from(i128::from_str_radix(str, radix)?))) - } -} - -impl PartialEq for LayerRTreeObject { - fn eq(&self, other: &Self) -> bool { - // FIXME: ptr_eq might fail to return true for 'dyn' - // references. Clippy complains about this. In practice it - // seems to work, the assertion below would be triggered - // otherwise but this ought to be fixed. - #[allow(clippy::vtable_address_comparisons)] - Arc::ptr_eq(&self.layer, &other.layer) - } -} - -impl RTreeObject for LayerRTreeObject -where - L: ?Sized, -{ - type Envelope = AABB<[IntKey; 2]>; - fn envelope(&self) -> Self::Envelope { - self.envelope - } -} - -impl LayerRTreeObject +/// Provide ability to batch more updates while hiding the read +/// API so we don't accidentally read without flushing. +impl BatchedUpdates<'_, L> where L: ?Sized + Layer, { - fn new(layer: Arc) -> Self { - let key_range = layer.get_key_range(); - let lsn_range = layer.get_lsn_range(); + /// + /// Insert an on-disk layer. + /// + pub fn insert_historic(&mut self, layer: Arc) { + self.layer_map.insert_historic_noflush(layer) + } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], // AABB::upper is inclusive, while `key_range.end` and `lsn_range.end` are exclusive - ); - LayerRTreeObject { layer, envelope } + /// + /// Remove an on-disk layer from the map. + /// + /// This should be called when the corresponding file on disk has been deleted. 
+ /// + pub fn remove_historic(&mut self, layer: Arc) { + self.layer_map.remove_historic_noflush(layer) + } + + // We will flush on drop anyway, but this method makes it + // more explicit that there is some work being done. + /// Apply all updates + pub fn flush(self) { + // Flush happens on drop + } +} + +// Ideally the flush() method should be called explicitly for more +// controlled execution. But if we forget we'd rather flush on drop +// than panic later or read without flushing. +// +// TODO maybe warn if flush hasn't explicitly been called +impl Drop for BatchedUpdates<'_, L> +where + L: ?Sized + Layer, +{ + fn drop(&mut self) { + self.layer_map.flush_updates(); } } @@ -281,125 +201,91 @@ where /// 'open' and 'frozen' layers! /// pub fn search(&self, key: Key, end_lsn: Lsn) -> Option> { - // Find the latest image layer that covers the given key - let mut latest_img: Option> = None; - let mut latest_img_lsn: Option = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0i128)], - [ - IntKey::from(key.to_i128()), - IntKey::from(end_lsn.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - assert!(l.get_key_range().contains(&key)); - let img_lsn = l.get_lsn_range().start; - assert!(img_lsn < end_lsn); - if Lsn(img_lsn.0 + 1) == end_lsn { - // found exact match - return Some(SearchResult { - layer: Arc::clone(l), - lsn_floor: img_lsn, - }); - } - if img_lsn > latest_img_lsn.unwrap_or(Lsn(0)) { - latest_img = Some(Arc::clone(l)); - latest_img_lsn = Some(img_lsn); - } - } + let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?; + let latest_delta = version.delta_coverage.query(key.to_i128()); + let latest_image = version.image_coverage.query(key.to_i128()); - // Search the delta layers - let mut latest_delta: Option> = None; - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; + match (latest_delta, latest_image) { + (None, None) => None, + (None, Some(image)) => { + let lsn_floor = image.get_lsn_range().start; + Some(SearchResult { + layer: image, + lsn_floor, + }) } - assert!(l.get_key_range().contains(&key)); - if l.get_lsn_range().start >= end_lsn { - info!( - "Candidate delta layer {}..{} is too new for lsn {}", - l.get_lsn_range().start, - l.get_lsn_range().end, - end_lsn - ); + (Some(delta), None) => { + let lsn_floor = delta.get_lsn_range().start; + Some(SearchResult { + layer: delta, + lsn_floor, + }) } - assert!(l.get_lsn_range().start < end_lsn); - if l.get_lsn_range().end >= end_lsn { - // this layer contains the requested point in the key/lsn space. - // No need to search any further - trace!( - "found layer {} for request on {key} at {end_lsn}", - l.short_id(), - ); - latest_delta.replace(Arc::clone(l)); - break; - } - if l.get_lsn_range().end > latest_img_lsn.unwrap_or(Lsn(0)) { - // this layer's end LSN is smaller than the requested point. If there's - // nothing newer, this is what we need to return. Remember this. 
- if let Some(old_candidate) = &latest_delta { - if l.get_lsn_range().end > old_candidate.get_lsn_range().end { - latest_delta.replace(Arc::clone(l)); - } + (Some(delta), Some(image)) => { + let img_lsn = image.get_lsn_range().start; + let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end; + let image_exact_match = img_lsn + 1 == end_lsn; + if image_is_newer || image_exact_match { + Some(SearchResult { + layer: image, + lsn_floor: img_lsn, + }) } else { - latest_delta.replace(Arc::clone(l)); + let lsn_floor = + std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1); + Some(SearchResult { + layer: delta, + lsn_floor, + }) } } } - if let Some(l) = latest_delta { - trace!( - "found (old) layer {} for request on {key} at {end_lsn}", - l.short_id(), - ); - let lsn_floor = std::cmp::max( - Lsn(latest_img_lsn.unwrap_or(Lsn(0)).0 + 1), - l.get_lsn_range().start, - ); - Some(SearchResult { - lsn_floor, - layer: l, - }) - } else if let Some(l) = latest_img { - trace!("found img layer and no deltas for request on {key} at {end_lsn}"); - Some(SearchResult { - lsn_floor: latest_img_lsn.unwrap(), - layer: l, - }) - } else { - trace!("no layer found for request on {key} at {end_lsn}"); - None - } + } + + /// Start a batch of updates, applied on drop + pub fn batch_update(&mut self) -> BatchedUpdates<'_, L> { + BatchedUpdates { layer_map: self } } /// /// Insert an on-disk layer /// - pub fn insert_historic(&mut self, layer: Arc) { - if layer.get_key_range() == (Key::MIN..Key::MAX) { - self.l0_delta_layers.push(layer.clone()); + /// Helper function for BatchedUpdates::insert_historic + /// + pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + self.historic.insert( + historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }, + Arc::clone(&layer), + ); + + if Self::is_l0(&layer) { + self.l0_delta_layers.push(layer); } - self.historic_layers.insert(LayerRTreeObject::new(layer)); + NUM_ONDISK_LAYERS.inc(); } /// /// Remove an on-disk layer from the map. /// - /// This should be called when the corresponding file on disk has been deleted. + /// Helper function for BatchedUpdates::remove_historic /// - pub fn remove_historic(&mut self, layer: Arc) { - if layer.get_key_range() == (Key::MIN..Key::MAX) { + pub fn remove_historic_noflush(&mut self, layer: Arc) { + let kr = layer.get_key_range(); + let lr = layer.get_lsn_range(); + self.historic.remove(historic_layer_coverage::LayerKey { + key: kr.start.to_i128()..kr.end.to_i128(), + lsn: lr.start.0..lr.end.0, + is_image: !layer.is_incremental(), + }); + + if Self::is_l0(&layer) { let len_before = self.l0_delta_layers.len(); // FIXME: ptr_eq might fail to return true for 'dyn' @@ -411,98 +297,57 @@ where .retain(|other| !Arc::ptr_eq(other, &layer)); assert_eq!(self.l0_delta_layers.len(), len_before - 1); } - assert!(self - .historic_layers - .remove(&LayerRTreeObject::new(layer)) - .is_some()); + NUM_ONDISK_LAYERS.dec(); } + /// Helper function for BatchedUpdates::drop. + pub(self) fn flush_updates(&mut self) { + self.historic.rebuild(); + } + /// Is there a newer image layer for given key- and LSN-range? Or a set /// of image layers within the specified lsn range that cover the entire /// specified key range? /// /// This is used for garbage collection, to determine if an old layer can /// be deleted. 
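The BatchedUpdates guard introduced above follows a common Rust batching pattern: hold an exclusive borrow of the index for the whole batch, so no reads can observe an unflushed state, and rebuild exactly once on an explicit flush() or on drop. A minimal self-contained sketch of that pattern with made-up Index/Batch types (not the patch's actual API):

struct Index {
    data: Vec<u64>,
    rebuilds: usize,
}

struct Batch<'a> {
    index: &'a mut Index,
}

impl Index {
    // Hand out a guard holding &mut self: the borrow checker forbids reads mid-batch.
    fn batch_update(&mut self) -> Batch<'_> {
        Batch { index: self }
    }

    fn rebuild(&mut self) {
        self.data.sort_unstable();
        self.rebuilds += 1;
    }
}

impl Batch<'_> {
    fn insert(&mut self, v: u64) {
        self.index.data.push(v);
    }

    // Consuming self makes the end of the batch explicit at the call site.
    fn flush(self) {}
}

impl Drop for Batch<'_> {
    fn drop(&mut self) {
        // One rebuild per batch, whether flush() was called or the guard was dropped.
        self.index.rebuild();
    }
}

fn main() {
    let mut idx = Index { data: Vec::new(), rebuilds: 0 };
    let mut batch = idx.batch_update();
    batch.insert(3);
    batch.insert(1);
    batch.flush();
    assert_eq!(idx.data, vec![1, 3]);
    assert_eq!(idx.rebuilds, 1);
}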
- pub fn image_layer_exists( - &self, - key_range: &Range, - lsn_range: &Range, - ) -> Result { - let mut range_remain = key_range.clone(); + pub fn image_layer_exists(&self, key: &Range, lsn: &Range) -> Result { + if key.is_empty() { + // Vacuously true. There's a newer image for all 0 of the kerys in the range. + return Ok(true); + } - loop { - let mut made_progress = false; - let envelope = AABB::from_corners( - [ - IntKey::from(range_remain.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(range_remain.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - let img_lsn = l.get_lsn_range().start; - if l.get_key_range().contains(&range_remain.start) && lsn_range.contains(&img_lsn) { - made_progress = true; - let img_key_end = l.get_key_range().end; + let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { + Some(v) => v, + None => return Ok(false), + }; - if img_key_end >= range_remain.end { - return Ok(true); - } - range_remain.start = img_key_end; - } - } + let start = key.start.to_i128(); + let end = key.end.to_i128(); - if !made_progress { + let layer_covers = |layer: Option>| match layer { + Some(layer) => layer.get_lsn_range().start >= lsn.start, + None => false, + }; + + // Check the start is covered + if !layer_covers(version.image_coverage.query(start)) { + return Ok(false); + } + + // Check after all changes of coverage + for (_, change_val) in version.image_coverage.range(start..end) { + if !layer_covers(change_val) { return Ok(false); } } + + Ok(true) } pub fn iter_historic_layers(&self) -> impl '_ + Iterator> { - self.historic_layers.iter().map(|e| e.layer.clone()) - } - - /// Find the last image layer that covers 'key', ignoring any image layers - /// newer than 'lsn'. 
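The rewritten image_layer_exists above only has to look at the coverage value in effect at the start of the key range and at each coverage change point inside it, rather than rescanning layers until no further progress is made. A simplified, self-contained sketch of that check, where "covered" is reduced to the mere presence of a value; the real code additionally compares the layer's LSN range against the requested one:

use std::collections::BTreeMap;

// Coverage is modelled as "from key K onwards, the newest image is V (or None)".
fn range_covered(coverage: &BTreeMap<i128, Option<&str>>, start: i128, end: i128) -> bool {
    // The value in effect at `start` must qualify...
    let at_start = coverage
        .range(..=start)
        .next_back()
        .and_then(|(_, v)| *v)
        .is_some();
    // ...and so must the value after every coverage change inside [start, end).
    at_start && coverage.range(start..end).all(|(_, v)| v.is_some())
}

fn main() {
    // An image covers keys [0, 10); nothing covers [10, ..).
    let coverage = BTreeMap::from([(0i128, Some("img")), (10, None)]);
    assert!(range_covered(&coverage, 2, 9));
    assert!(!range_covered(&coverage, 2, 15));
}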
- fn find_latest_image(&self, key: Key, lsn: Lsn) -> Option> { - let mut candidate_lsn = Lsn(0); - let mut candidate = None; - let envelope = AABB::from_corners( - [IntKey::from(key.to_i128()), IntKey::from(0)], - [IntKey::from(key.to_i128()), IntKey::from(lsn.0 as i128)], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if l.is_incremental() { - continue; - } - - assert!(l.get_key_range().contains(&key)); - let this_lsn = l.get_lsn_range().start; - assert!(this_lsn <= lsn); - if this_lsn < candidate_lsn { - // our previous candidate was better - continue; - } - candidate_lsn = this_lsn; - candidate = Some(Arc::clone(l)); - } - - candidate + self.historic.iter() } /// @@ -518,94 +363,288 @@ where key_range: &Range, lsn: Lsn, ) -> Result, Option>)>> { - let mut points = vec![key_range.start]; - let envelope = AABB::from_corners( - [IntKey::from(key_range.start.to_i128()), IntKey::from(0)], - [ - IntKey::from(key_range.end.to_i128()), - IntKey::from(lsn.0 as i128), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - assert!(l.get_lsn_range().start <= lsn); - let range = l.get_key_range(); - if key_range.contains(&range.start) { - points.push(l.get_key_range().start); - } - if key_range.contains(&range.end) { - points.push(l.get_key_range().end); - } + let version = match self.historic.get().unwrap().get_version(lsn.0) { + Some(v) => v, + None => return Ok(vec![]), + }; + + let start = key_range.start.to_i128(); + let end = key_range.end.to_i128(); + + // Initialize loop variables + let mut coverage: Vec<(Range, Option>)> = vec![]; + let mut current_key = start; + let mut current_val = version.image_coverage.query(start); + + // Loop through the change events and push intervals + for (change_key, change_val) in version.image_coverage.range(start..end) { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); + coverage.push((kr, current_val.take())); + current_key = change_key; + current_val = change_val.clone(); } - points.push(key_range.end); - points.sort(); - points.dedup(); + // Add the final interval + let kr = Key::from_i128(current_key)..Key::from_i128(end); + coverage.push((kr, current_val.take())); - // Ok, we now have a list of "interesting" points in the key space - - // For each range between the points, find the latest image - let mut start = *points.first().unwrap(); - let mut ranges = Vec::new(); - for end in points[1..].iter() { - let img = self.find_latest_image(start, lsn); - - ranges.push((start..*end, img)); - - start = *end; - } - Ok(ranges) + Ok(coverage) } - /// Count the height of the tallest stack of deltas in this 2d region. + pub fn is_l0(layer: &L) -> bool { + range_eq(&layer.get_key_range(), &(Key::MIN..Key::MAX)) + } + + /// This function determines which layers are counted in `count_deltas`: + /// layers that should count towards deciding whether or not to reimage + /// a certain partition range. + /// + /// There are two kinds of layers we currently consider reimage-worthy: + /// + /// Case 1: Non-L0 layers are currently reimage-worthy by default. + /// TODO Some of these layers are very sparse and cover the entire key + /// range. Replacing 256MB of data (or less!) with terabytes of + /// images doesn't seem wise. 
We need a better heuristic, possibly + /// based on some of these factors: + /// a) whether this layer has any wal in this partition range + /// b) the size of the layer + /// c) the number of images needed to cover it + /// d) the estimated time until we'll have to reimage over it for GC + /// + /// Case 2: Since L0 layers by definition cover the entire key space, we consider + /// them reimage-worthy only when the entire key space can be covered by very few + /// images (currently 1). + /// TODO The optimal number should probably be slightly higher than 1, but to + /// implement that we need to plumb a lot more context into this function + /// than just the current partition_range. + pub fn is_reimage_worthy(layer: &L, partition_range: &Range) -> bool { + // Case 1 + if !Self::is_l0(layer) { + return true; + } + + // Case 2 + if range_eq(partition_range, &(Key::MIN..Key::MAX)) { + return true; + } + + false + } + + /// Count the height of the tallest stack of reimage-worthy deltas + /// in this 2d region. + /// + /// If `limit` is provided we don't try to count above that number. /// /// This number is used to compute the largest number of deltas that /// we'll need to visit for any page reconstruction in this region. /// We use this heuristic to decide whether to create an image layer. - /// - /// TODO currently we just return the total number of deltas in the - /// region, no matter if they're stacked on top of each other - /// or next to each other. - pub fn count_deltas(&self, key_range: &Range, lsn_range: &Range) -> Result { - let mut result = 0; - if lsn_range.start >= lsn_range.end { + pub fn count_deltas( + &self, + key: &Range, + lsn: &Range, + limit: Option, + ) -> Result { + // We get the delta coverage of the region, and for each part of the coverage + // we recurse right underneath the delta. The recursion depth is limited by + // the largest result this function could return, which is in practice between + // 3 and 10 (since we usually try to create an image when the number gets larger). + + if lsn.is_empty() || key.is_empty() || limit == Some(0) { return Ok(0); } - let envelope = AABB::from_corners( - [ - IntKey::from(key_range.start.to_i128()), - IntKey::from(lsn_range.start.0 as i128), - ], - [ - IntKey::from(key_range.end.to_i128() - 1), - IntKey::from(lsn_range.end.0 as i128 - 1), - ], - ); - for e in self - .historic_layers - .locate_in_envelope_intersecting(&envelope) - { - let l = &e.layer; - if !l.is_incremental() { - continue; - } - assert!(range_overlaps(&l.get_lsn_range(), lsn_range)); - assert!(range_overlaps(&l.get_key_range(), key_range)); - // We ignore level0 delta layers. 
Unless the whole keyspace fits - // into one partition - if !range_eq(key_range, &(Key::MIN..Key::MAX)) - && range_eq(&l.get_key_range(), &(Key::MIN..Key::MAX)) - { - continue; + let version = match self.historic.get().unwrap().get_version(lsn.end.0 - 1) { + Some(v) => v, + None => return Ok(0), + }; + + let start = key.start.to_i128(); + let end = key.end.to_i128(); + + // Initialize loop variables + let mut max_stacked_deltas = 0; + let mut current_key = start; + let mut current_val = version.delta_coverage.query(start); + + // Loop through the delta coverage and recurse on each part + for (change_key, change_val) in version.delta_coverage.range(start..end) { + // If there's a relevant delta in this part, add 1 and recurse down + if let Some(val) = current_val { + if val.get_lsn_range().end > lsn.start { + let kr = Key::from_i128(current_key)..Key::from_i128(change_key); + let lr = lsn.start..val.get_lsn_range().start; + if !kr.is_empty() { + let base_count = Self::is_reimage_worthy(&val, key) as usize; + let new_limit = limit.map(|l| l - base_count); + let max_stacked_deltas_underneath = + self.count_deltas(&kr, &lr, new_limit)?; + max_stacked_deltas = std::cmp::max( + max_stacked_deltas, + base_count + max_stacked_deltas_underneath, + ); + } + } } - result += 1; + current_key = change_key; + current_val = change_val.clone(); } - Ok(result) + + // Consider the last part + if let Some(val) = current_val { + if val.get_lsn_range().end > lsn.start { + let kr = Key::from_i128(current_key)..Key::from_i128(end); + let lr = lsn.start..val.get_lsn_range().start; + + if !kr.is_empty() { + let base_count = Self::is_reimage_worthy(&val, key) as usize; + let new_limit = limit.map(|l| l - base_count); + let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?; + max_stacked_deltas = std::cmp::max( + max_stacked_deltas, + base_count + max_stacked_deltas_underneath, + ); + } + } + } + + Ok(max_stacked_deltas) + } + + /// Count how many reimage-worthy layers we need to visit for given key-lsn pair. + /// + /// The `partition_range` argument is used as context for the reimage-worthiness decision. + /// + /// Used as a helper for correctness checks only. Performance not critical. + pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range) -> usize { + match self.search(key, lsn) { + Some(search_result) => { + if search_result.layer.is_incremental() { + (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize) + + self.get_difficulty(search_result.lsn_floor, key, partition_range) + } else { + 0 + } + } + None => 0, + } + } + + /// Used for correctness checking. Results are expected to be identical to + /// self.get_difficulty_map. Assumes self.search is correct. + pub fn get_difficulty_map_bruteforce( + &self, + lsn: Lsn, + partitioning: &KeyPartitioning, + ) -> Vec { + // Looking at the difficulty as a function of key, it could only increase + // when a delta layer starts or an image layer ends. 
Therefore it's sufficient + // to check the difficulties at: + // - the key.start for each non-empty part range + // - the key.start for each delta + // - the key.end for each image + let keys_iter: Box> = { + let mut keys: Vec = self + .iter_historic_layers() + .map(|layer| { + if layer.is_incremental() { + layer.get_key_range().start + } else { + layer.get_key_range().end + } + }) + .collect(); + keys.sort(); + Box::new(keys.into_iter()) + }; + let mut keys_iter = keys_iter.peekable(); + + // Iter the partition and keys together and query all the necessary + // keys, computing the max difficulty for each part. + partitioning + .parts + .iter() + .map(|part| { + let mut difficulty = 0; + // Partition ranges are assumed to be sorted and disjoint + // TODO assert it + for range in &part.ranges { + if !range.is_empty() { + difficulty = + std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range)); + } + while let Some(key) = keys_iter.peek() { + if key >= &range.end { + break; + } + let key = keys_iter.next().unwrap(); + if key < range.start { + continue; + } + difficulty = + std::cmp::max(difficulty, self.get_difficulty(lsn, key, range)); + } + } + difficulty + }) + .collect() + } + + /// For each part of a keyspace partitioning, return the maximum number of layers + /// that would be needed for page reconstruction in that part at the given LSN. + /// + /// If `limit` is provided we don't try to count above that number. + /// + /// This method is used to decide where to create new image layers. Computing the + /// result for the entire partitioning at once allows this function to be more + /// efficient, and further optimization is possible by using iterators instead, + /// to allow early return. + /// + /// TODO actually use this method instead of count_deltas. Currently we only use + /// it for benchmarks. + pub fn get_difficulty_map( + &self, + lsn: Lsn, + partitioning: &KeyPartitioning, + limit: Option, + ) -> Vec { + // TODO This is a naive implementation. Perf improvements to do: + // 1. Instead of calling self.image_coverage and self.count_deltas, + // iterate the image and delta coverage only once. + partitioning + .parts + .iter() + .map(|part| { + let mut difficulty = 0; + for range in &part.ranges { + if limit == Some(difficulty) { + break; + } + for (img_range, last_img) in self + .image_coverage(range, lsn) + .expect("why would this err?") + { + if limit == Some(difficulty) { + break; + } + let img_lsn = if let Some(last_img) = last_img { + last_img.get_lsn_range().end + } else { + Lsn(0) + }; + + if img_lsn < lsn { + let num_deltas = self + .count_deltas(&img_range, &(img_lsn..lsn), limit) + .expect("why would this err lol?"); + difficulty = std::cmp::max(difficulty, num_deltas); + } + } + } + difficulty + }) + .collect() } /// Return all L0 delta layers @@ -629,8 +668,8 @@ where } println!("historic_layers:"); - for e in self.historic_layers.iter() { - e.layer.dump(verbose)?; + for layer in self.iter_historic_layers() { + layer.dump(verbose)?; } println!("End dump LayerMap"); Ok(()) diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs new file mode 100644 index 0000000000..46821aef15 --- /dev/null +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -0,0 +1,583 @@ +use std::collections::BTreeMap; +use std::ops::Range; + +use tracing::info; + +use super::layer_coverage::LayerCoverageTuple; + +/// Layers in this module are identified and indexed by this data. 
+/// +/// This is a helper struct to enable sorting layers by lsn.start. +/// +/// These three values are enough to uniquely identify a layer, since +/// a layer is obligated to contain all contents within range, so two +/// deltas (or images) with the same range have identical content. +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct LayerKey { + // TODO I use i128 and u64 because it was easy for prototyping, + // testing, and benchmarking. If we can use the Lsn and Key + // types without overhead that would be preferable. + pub key: Range, + pub lsn: Range, + pub is_image: bool, +} + +impl PartialOrd for LayerKey { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for LayerKey { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // NOTE we really care about comparing by lsn.start first + self.lsn + .start + .cmp(&other.lsn.start) + .then(self.lsn.end.cmp(&other.lsn.end)) + .then(self.key.start.cmp(&other.key.start)) + .then(self.key.end.cmp(&other.key.end)) + .then(self.is_image.cmp(&other.is_image)) + } +} + +/// Efficiently queryable layer coverage for each LSN. +/// +/// Allows answering layer map queries very efficiently, +/// but doesn't allow retroactive insertion, which is +/// sometimes necessary. See BufferedHistoricLayerCoverage. +pub struct HistoricLayerCoverage { + /// The latest state + head: LayerCoverageTuple, + + /// All previous states + historic: BTreeMap>, +} + +impl Default for HistoricLayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl HistoricLayerCoverage { + pub fn new() -> Self { + Self { + head: LayerCoverageTuple::default(), + historic: BTreeMap::default(), + } + } + + /// Add a layer + /// + /// Panics if new layer has older lsn.start than an existing layer. + /// See BufferedHistoricLayerCoverage for a more general insertion method. + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { + // It's only a persistent map, not a retroactive one + if let Some(last_entry) = self.historic.iter().next_back() { + let last_lsn = last_entry.0; + if layer_key.lsn.start < *last_lsn { + panic!("unexpected retroactive insert"); + } + } + + // Insert into data structure + if layer_key.is_image { + self.head + .image_coverage + .insert(layer_key.key, layer_key.lsn.clone(), value); + } else { + self.head + .delta_coverage + .insert(layer_key.key, layer_key.lsn.clone(), value); + } + + // Remember history. Clone is O(1) + self.historic.insert(layer_key.lsn.start, self.head.clone()); + } + + /// Query at a particular LSN, inclusive + pub fn get_version(&self, lsn: u64) -> Option<&LayerCoverageTuple> { + match self.historic.range(..=lsn).next_back() { + Some((_, v)) => Some(v), + None => None, + } + } + + /// Remove all entries after a certain LSN (inclusive) + pub fn trim(&mut self, begin: &u64) { + self.historic.split_off(begin); + self.head = self + .historic + .iter() + .rev() + .next() + .map(|(_, v)| v.clone()) + .unwrap_or_default(); + } +} + +/// This is the most basic test that demonstrates intended usage. +/// All layers in this test have height 1. 
+#[test] +fn test_persistent_simple() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 0..5, + lsn: 100..101, + is_image: true, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 3..9, + lsn: 110..111, + is_image: true, + }, + "Layer 2".to_string(), + ); + map.insert( + LayerKey { + key: 5..6, + lsn: 120..121, + is_image: true, + }, + "Layer 3".to_string(), + ); + + // After Layer 1 insertion + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + + // After Layer 2 insertion + let version = map.get_version(115).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(11), None); + + // After Layer 3 insertion + let version = map.get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 2".to_string())); +} + +/// Cover simple off-by-one edge cases +#[test] +fn test_off_by_one() { + let mut map = HistoricLayerCoverage::::new(); + map.insert( + LayerKey { + key: 3..5, + lsn: 100..110, + is_image: true, + }, + "Layer 1".to_string(), + ); + + // Check different LSNs + let version = map.get_version(99); + assert!(version.is_none()); + let version = map.get_version(100).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + let version = map.get_version(110).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + + // Check different keys + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!(version.image_coverage.query(3), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(5), None); +} + +/// Cover edge cases where layers begin or end on the same key +#[test] +fn test_key_collision() { + let mut map = HistoricLayerCoverage::::new(); + + map.insert( + LayerKey { + key: 3..5, + lsn: 100..110, + is_image: true, + }, + "Layer 10".to_string(), + ); + map.insert( + LayerKey { + key: 5..8, + lsn: 100..110, + is_image: true, + }, + "Layer 11".to_string(), + ); + map.insert( + LayerKey { + key: 3..4, + lsn: 200..210, + is_image: true, + }, + "Layer 20".to_string(), + ); + + // Check after layer 11 + let version = map.get_version(105).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!( + version.image_coverage.query(3), + Some("Layer 10".to_string()) + ); + assert_eq!( + version.image_coverage.query(5), + Some("Layer 11".to_string()) + ); + assert_eq!( + version.image_coverage.query(7), + Some("Layer 11".to_string()) + ); + assert_eq!(version.image_coverage.query(8), None); + + // Check after layer 20 + let version = map.get_version(205).unwrap(); + assert_eq!(version.image_coverage.query(2), None); + assert_eq!( + version.image_coverage.query(3), + Some("Layer 20".to_string()) + ); + assert_eq!( + version.image_coverage.query(5), + Some("Layer 11".to_string()) + ); + assert_eq!( + version.image_coverage.query(7), + Some("Layer 11".to_string()) + ); + assert_eq!(version.image_coverage.query(8), None); +} + +/// Test when rectangles have nontrivial 
height and possibly overlap +#[test] +fn test_persistent_overlapping() { + let mut map = HistoricLayerCoverage::::new(); + + // Add 3 key-disjoint layers with varying LSN ranges + map.insert( + LayerKey { + key: 1..2, + lsn: 100..200, + is_image: true, + }, + "Layer 1".to_string(), + ); + map.insert( + LayerKey { + key: 4..5, + lsn: 110..200, + is_image: true, + }, + "Layer 2".to_string(), + ); + map.insert( + LayerKey { + key: 7..8, + lsn: 120..300, + is_image: true, + }, + "Layer 3".to_string(), + ); + + // Add wide and short layer + map.insert( + LayerKey { + key: 0..9, + lsn: 130..199, + is_image: true, + }, + "Layer 4".to_string(), + ); + + // Add wide layer taller than some + map.insert( + LayerKey { + key: 0..9, + lsn: 140..201, + is_image: true, + }, + "Layer 5".to_string(), + ); + + // Add wide layer taller than all + map.insert( + LayerKey { + key: 0..9, + lsn: 150..301, + is_image: true, + }, + "Layer 6".to_string(), + ); + + // After layer 4 insertion + let version = map.get_version(135).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 1".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 2".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 4".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 4".to_string())); + + // After layer 5 insertion + let version = map.get_version(145).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 5".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 3".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 5".to_string())); + + // After layer 6 insertion + let version = map.get_version(155).unwrap(); + assert_eq!(version.image_coverage.query(0), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(1), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(2), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(4), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(5), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(7), Some("Layer 6".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Layer 6".to_string())); +} + +/// Wrapper for HistoricLayerCoverage that allows us to hack around the lack +/// of support for retroactive insertion by rebuilding the map since the +/// change. +/// +/// Why is this needed? We most often insert new layers with newer LSNs, +/// but during compaction we create layers with non-latest LSN, and during +/// GC we delete historic layers. +/// +/// Even though rebuilding is an expensive (N log N) solution to the problem, +/// it's not critical since we do something equally expensive just to decide +/// whether or not to create new image layers. +/// TODO It's not expensive but it's not great to hold a layer map write lock +/// for that long. 
+/// +/// If this becomes an actual bottleneck, one solution would be to build a +/// segment tree that holds PersistentLayerMaps. Though this would mean that +/// we take an additional log(N) performance hit for queries, which will probably +/// still be more critical. +/// +/// See this for more on persistent and retroactive techniques: +/// https://www.youtube.com/watch?v=WqCWghETNDc&t=581s +pub struct BufferedHistoricLayerCoverage { + /// A persistent layer map that we rebuild when we need to retroactively update + historic_coverage: HistoricLayerCoverage, + + /// We buffer insertion into the PersistentLayerMap to decrease the number of rebuilds. + buffer: BTreeMap>, + + /// All current layers. This is not used for search. Only to make rebuilds easier. + layers: BTreeMap, +} + +impl std::fmt::Debug for BufferedHistoricLayerCoverage { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RetroactiveLayerMap") + .field("buffer", &self.buffer) + .field("layers", &self.layers) + .finish() + } +} + +impl Default for BufferedHistoricLayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl BufferedHistoricLayerCoverage { + pub fn new() -> Self { + Self { + historic_coverage: HistoricLayerCoverage::::new(), + buffer: BTreeMap::new(), + layers: BTreeMap::new(), + } + } + + pub fn insert(&mut self, layer_key: LayerKey, value: Value) { + self.buffer.insert(layer_key, Some(value)); + } + + pub fn remove(&mut self, layer_key: LayerKey) { + self.buffer.insert(layer_key, None); + } + + pub fn rebuild(&mut self) { + // Find the first LSN that needs to be rebuilt + let rebuild_since: u64 = match self.buffer.iter().next() { + Some((LayerKey { lsn, .. }, _)) => lsn.start, + None => return, // No need to rebuild if buffer is empty + }; + + // Apply buffered updates to self.layers + let num_updates = self.buffer.len(); + self.buffer.retain(|layer_key, layer| { + match layer { + Some(l) => { + self.layers.insert(layer_key.clone(), l.clone()); + } + None => { + self.layers.remove(layer_key); + } + }; + false + }); + + // Rebuild + let mut num_inserted = 0; + self.historic_coverage.trim(&rebuild_since); + for (layer_key, layer) in self.layers.range( + LayerKey { + lsn: rebuild_since..0, + key: 0..0, + is_image: false, + }.., + ) { + self.historic_coverage + .insert(layer_key.clone(), layer.clone()); + num_inserted += 1; + } + + // TODO maybe only warn if ratio is at least 10 + info!( + "Rebuilt layer map. Did {} insertions to process a batch of {} updates.", + num_inserted, num_updates, + ) + } + + /// Iterate all the layers + pub fn iter(&self) -> impl '_ + Iterator { + // NOTE we can actually perform this without rebuilding, + // but it's not necessary for now. + if !self.buffer.is_empty() { + panic!("rebuild pls") + } + + self.layers.values().cloned() + } + + /// Return a reference to a queryable map, assuming all updates + /// have already been processed using self.rebuild() + pub fn get(&self) -> anyhow::Result<&HistoricLayerCoverage> { + // NOTE we error here instead of implicitly rebuilding because + // rebuilding is somewhat expensive. + // TODO maybe implicitly rebuild and log/sentry an error? 
+ if !self.buffer.is_empty() { + anyhow::bail!("rebuild required") + } + + Ok(&self.historic_coverage) + } +} + +#[test] +fn test_retroactive_regression_1() { + let mut map = BufferedHistoricLayerCoverage::new(); + + map.insert( + LayerKey { + key: 0..21267647932558653966460912964485513215, + lsn: 23761336..23761457, + is_image: false, + }, + "sdfsdfs".to_string(), + ); + + map.rebuild(); + + let version = map.get().unwrap().get_version(23761457).unwrap(); + assert_eq!( + version.delta_coverage.query(100), + Some("sdfsdfs".to_string()) + ); +} + +#[test] +fn test_retroactive_simple() { + let mut map = BufferedHistoricLayerCoverage::new(); + + // Append some images in increasing LSN order + map.insert( + LayerKey { + key: 0..5, + lsn: 100..101, + is_image: true, + }, + "Image 1".to_string(), + ); + map.insert( + LayerKey { + key: 3..9, + lsn: 110..111, + is_image: true, + }, + "Image 2".to_string(), + ); + map.insert( + LayerKey { + key: 4..6, + lsn: 120..121, + is_image: true, + }, + "Image 3".to_string(), + ); + map.insert( + LayerKey { + key: 8..9, + lsn: 120..121, + is_image: true, + }, + "Image 4".to_string(), + ); + + // Add a delta layer out of order + map.insert( + LayerKey { + key: 2..5, + lsn: 105..106, + is_image: true, + }, + "Delta 1".to_string(), + ); + + // Rebuild so we can start querying + map.rebuild(); + + // Query key 4 + let version = map.get().unwrap().get_version(90); + assert!(version.is_none()); + let version = map.get().unwrap().get_version(102).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 1".to_string())); + let version = map.get().unwrap().get_version(107).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Delta 1".to_string())); + let version = map.get().unwrap().get_version(115).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); + let version = map.get().unwrap().get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 3".to_string())); + + // Remove Image 3 + map.remove(LayerKey { + key: 4..6, + lsn: 120..121, + is_image: true, + }); + map.rebuild(); + + // Check deletion worked + let version = map.get().unwrap().get_version(125).unwrap(); + assert_eq!(version.image_coverage.query(4), Some("Image 2".to_string())); + assert_eq!(version.image_coverage.query(8), Some("Image 4".to_string())); +} diff --git a/pageserver/src/tenant/layer_map/layer_coverage.rs b/pageserver/src/tenant/layer_map/layer_coverage.rs new file mode 100644 index 0000000000..4e3b4516dc --- /dev/null +++ b/pageserver/src/tenant/layer_map/layer_coverage.rs @@ -0,0 +1,154 @@ +use std::ops::Range; + +// TODO the `im` crate has 20x more downloads and also has +// persistent/immutable BTree. It also runs a bit faster but +// results are not the same on some tests. +use rpds::RedBlackTreeMapSync; + +/// Data structure that can efficiently: +/// - find the latest layer by lsn.end at a given key +/// - iterate the latest layers in a key range +/// - insert layers in non-decreasing lsn.start order +/// +/// The struct is parameterized over Value for easier +/// testing, but in practice it's some sort of layer. +pub struct LayerCoverage { + /// For every change in coverage (as we sweep the key space) + /// we store (lsn.end, value). + /// + /// We use an immutable/persistent tree so that we can keep historic + /// versions of this coverage without cloning the whole thing and + /// incurring quadratic memory cost. See HistoricLayerCoverage. 
+ /// + /// We use the Sync version of the map because we want Self to + /// be Sync. Using nonsync might be faster, if we can work with + /// that. + nodes: RedBlackTreeMapSync>, +} + +impl Default for LayerCoverage { + fn default() -> Self { + Self::new() + } +} + +impl LayerCoverage { + pub fn new() -> Self { + Self { + nodes: RedBlackTreeMapSync::default(), + } + } + + /// Helper function to subdivide the key range without changing any values + /// + /// Complexity: O(log N) + fn add_node(&mut self, key: i128) { + let value = match self.nodes.range(..=key).last() { + Some((_, Some(v))) => Some(v.clone()), + Some((_, None)) => None, + None => None, + }; + self.nodes.insert_mut(key, value); + } + + /// Insert a layer. + /// + /// Complexity: worst case O(N), in practice O(log N). See NOTE in implementation. + pub fn insert(&mut self, key: Range, lsn: Range, value: Value) { + // Add nodes at endpoints + // + // NOTE The order of lines is important. We add nodes at the start + // and end of the key range **before updating any nodes** in order + // to pin down the current coverage outside of the relevant key range. + // Only the coverage inside the layer's key range should change. + self.add_node(key.start); + self.add_node(key.end); + + // Raise the height where necessary + // + // NOTE This loop is worst case O(N), but amortized O(log N) in the special + // case when rectangles have no height. In practice I don't think we'll see + // the kind of layer intersections needed to trigger O(N) behavior. The worst + // case is N/2 horizontal layers overlapped with N/2 vertical layers in a + // grid pattern. + let mut to_update = Vec::new(); + let mut to_remove = Vec::new(); + let mut prev_covered = false; + for (k, node) in self.nodes.range(key.clone()) { + let needs_cover = match node { + None => true, + Some((h, _)) => h < &lsn.end, + }; + if needs_cover { + match prev_covered { + true => to_remove.push(*k), + false => to_update.push(*k), + } + } + prev_covered = needs_cover; + } + if !prev_covered { + to_remove.push(key.end); + } + for k in to_update { + self.nodes.insert_mut(k, Some((lsn.end, value.clone()))); + } + for k in to_remove { + self.nodes.remove_mut(&k); + } + } + + /// Get the latest (by lsn.end) layer at a given key + /// + /// Complexity: O(log N) + pub fn query(&self, key: i128) -> Option { + self.nodes + .range(..=key) + .rev() + .next()? + .1 + .as_ref() + .map(|(_, v)| v.clone()) + } + + /// Iterate the changes in layer coverage in a given range. You will likely + /// want to start with self.query(key.start), and then follow up with self.range + /// + /// Complexity: O(log N + result_size) + pub fn range(&self, key: Range) -> impl '_ + Iterator)> { + self.nodes + .range(key) + .map(|(k, v)| (*k, v.as_ref().map(|x| x.1.clone()))) + } + + /// O(1) clone + pub fn clone(&self) -> Self { + Self { + nodes: self.nodes.clone(), + } + } +} + +/// Image and delta coverage at a specific LSN. 
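The O(1) clone mentioned above is what makes keeping one coverage snapshot per LSN affordable: the persistent map shares nodes between snapshots instead of copying them. A small sketch of that property, assuming only the rpds crate already used by this file; the keys and values are placeholders:

use rpds::RedBlackTreeMapSync;

fn main() {
    let mut coverage: RedBlackTreeMapSync<i128, &str> = RedBlackTreeMapSync::default();
    let mut snapshots = Vec::new();

    coverage.insert_mut(0, "layer-a");
    snapshots.push(coverage.clone()); // cheap: shares structure with `coverage`

    coverage.insert_mut(5, "layer-b");
    snapshots.push(coverage.clone());

    // Each snapshot still answers queries as of the moment it was taken.
    assert_eq!(
        snapshots[0].range(..=7i128).last().map(|(_, v)| *v),
        Some("layer-a")
    );
    assert_eq!(
        snapshots[1].range(..=7i128).last().map(|(_, v)| *v),
        Some("layer-b")
    );
}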
+pub struct LayerCoverageTuple { + pub image_coverage: LayerCoverage, + pub delta_coverage: LayerCoverage, +} + +impl Default for LayerCoverageTuple { + fn default() -> Self { + Self { + image_coverage: LayerCoverage::default(), + delta_coverage: LayerCoverage::default(), + } + } +} + +impl LayerCoverageTuple { + pub fn clone(&self) -> Self { + Self { + image_coverage: self.image_coverage.clone(), + delta_coverage: self.delta_coverage.clone(), + } + } +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index dce7cd8bae..a9edee3794 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -16,6 +16,7 @@ use remote_storage::GenericRemoteStorage; use utils::crashsafe; use crate::config::PageServerConf; +use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::TenantConfOpt; use crate::tenant::{Tenant, TenantState}; @@ -24,8 +25,35 @@ use crate::IGNORED_TENANT_FILE_NAME; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; -static TENANTS: Lazy>>> = - Lazy::new(|| RwLock::new(HashMap::new())); +/// The tenants known to the pageserver. +/// The enum variants are used to distinguish the different states that the pageserver can be in. +enum TenantsMap { + /// [`init_tenant_mgr`] is not done yet. + Initializing, + /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded. + /// New tenants can be added using [`tenant_map_insert`]. + Open(HashMap>), + /// The pageserver has entered shutdown mode via [`shutdown_all_tenants`]. + /// Existing tenants are still accessible, but no new tenants can be created. + ShuttingDown(HashMap>), +} + +impl TenantsMap { + fn get(&self, tenant_id: &TenantId) -> Option<&Arc> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id), + } + } + fn remove(&mut self, tenant_id: &TenantId) -> Option> { + match self { + TenantsMap::Initializing => None, + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id), + } + } +} + +static TENANTS: Lazy> = Lazy::new(|| RwLock::new(TenantsMap::Initializing)); /// Initialize repositories with locally available timelines. 
/// Timelines that are only partially available locally (remote storage has more data than this pageserver) @@ -36,13 +64,16 @@ pub async fn init_tenant_mgr( remote_storage: Option, ) -> anyhow::Result<()> { // Scan local filesystem for attached tenants - let mut number_of_tenants = 0; let tenants_dir = conf.tenants_path(); + let mut tenants = HashMap::new(); + let mut dir_entries = fs::read_dir(&tenants_dir) .await .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn); + loop { match dir_entries.next_entry().await { Ok(None) => break, @@ -86,10 +117,10 @@ pub async fn init_tenant_mgr( conf, &tenant_dir_path, remote_storage.clone(), + &ctx, ) { Ok(tenant) => { - TENANTS.write().await.insert(tenant.tenant_id(), tenant); - number_of_tenants += 1; + tenants.insert(tenant.tenant_id(), tenant); } Err(e) => { error!("Failed to collect tenant files from dir {tenants_dir:?} for entry {dir_entry:?}, reason: {e:#}"); @@ -108,7 +139,11 @@ pub async fn init_tenant_mgr( } } - info!("Processed {number_of_tenants} local tenants at startup"); + info!("Processed {} local tenants at startup", tenants.len()); + + let mut tenants_map = TENANTS.write().await; + assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); + *tenants_map = TenantsMap::Open(tenants); Ok(()) } @@ -116,6 +151,7 @@ pub fn schedule_local_tenant_processing( conf: &'static PageServerConf, tenant_path: &Path, remote_storage: Option, + ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( tenant_path.is_dir(), @@ -150,7 +186,7 @@ pub fn schedule_local_tenant_processing( let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() { info!("tenant {tenant_id} has attaching mark file, resuming its attach operation"); if let Some(remote_storage) = remote_storage { - Tenant::spawn_attach(conf, tenant_id, remote_storage) + Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) } else { warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured"); Tenant::create_broken_tenant(conf, tenant_id) @@ -158,7 +194,7 @@ pub fn schedule_local_tenant_processing( } else { info!("tenant {tenant_id} is assumed to be loadable, starting load operation"); // Start loading the tenant into memory. It will initially be in Loading state. - Tenant::spawn_load(conf, tenant_id, remote_storage) + Tenant::spawn_load(conf, tenant_id, remote_storage, ctx) }; Ok(tenant) } @@ -166,21 +202,44 @@ pub fn schedule_local_tenant_processing( /// /// Shut down all tenants. This runs as part of pageserver shutdown. /// +/// NB: We leave the tenants in the map, so that they remain accessible through +/// the management API until we shut it down. If we removed the shut-down tenants +/// from the tenants map, the management API would return 404 for these tenants, +/// because TenantsMap::get() now returns `None`. +/// That could be easily misinterpreted by control plane, the consumer of the +/// management API. For example, it could attach the tenant on a different pageserver. +/// We would then be in split-brain once this pageserver restarts. pub async fn shutdown_all_tenants() { + // Prevent new tenants from being created. 
let tenants_to_shut_down = { let mut m = TENANTS.write().await; - let mut tenants_to_shut_down = Vec::with_capacity(m.len()); - for (_, tenant) in m.drain() { - if tenant.is_active() { - // updates tenant state, forbidding new GC and compaction iterations from starting - tenant.set_stopping(); - tenants_to_shut_down.push(tenant) + match &mut *m { + TenantsMap::Initializing => { + *m = TenantsMap::ShuttingDown(HashMap::default()); + info!("tenants map is empty"); + return; + } + TenantsMap::Open(tenants) => { + let tenants_clone = tenants.clone(); + *m = TenantsMap::ShuttingDown(std::mem::take(tenants)); + tenants_clone + } + TenantsMap::ShuttingDown(_) => { + error!("already shutting down, this function isn't supposed to be called more than once"); + return; } } - drop(m); - tenants_to_shut_down }; + let mut tenants_to_freeze_and_flush = Vec::with_capacity(tenants_to_shut_down.len()); + for (_, tenant) in tenants_to_shut_down { + if tenant.is_active() { + // updates tenant state, forbidding new GC and compaction iterations from starting + tenant.set_stopping(); + tenants_to_freeze_and_flush.push(tenant); + } + } + // Shut down all existing walreceiver connections and stop accepting the new ones. task_mgr::shutdown_tasks(Some(TaskKind::WalReceiverManager), None, None).await; @@ -192,7 +251,7 @@ pub async fn shutdown_all_tenants() { // should be no more activity in any of the repositories. // // On error, log it but continue with the shutdown for other tenants. - for tenant in tenants_to_shut_down { + for tenant in tenants_to_freeze_and_flush { let tenant_id = tenant.tenant_id(); debug!("shutdown tenant {tenant_id}"); @@ -207,27 +266,23 @@ pub async fn create_tenant( tenant_conf: TenantConfOpt, tenant_id: TenantId, remote_storage: Option, -) -> anyhow::Result>> { - match TENANTS.write().await.entry(tenant_id) { - hash_map::Entry::Occupied(_) => { - debug!("tenant {tenant_id} already exists"); - Ok(None) - } - hash_map::Entry::Vacant(v) => { - // Hold the write_tenants() lock, since all of this is local IO. - // If this section ever becomes contentious, introduce a new `TenantState::Creating`. - let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; - let created_tenant = - schedule_local_tenant_processing(conf, &tenant_directory, remote_storage)?; - let crated_tenant_id = created_tenant.tenant_id(); - anyhow::ensure!( + ctx: &RequestContext, +) -> Result, TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { + // We're holding the tenants lock in write mode while doing local IO. + // If this section ever becomes contentious, introduce a new `TenantState::Creating` + // and do the work in that state. + let tenant_directory = super::create_tenant_files(conf, tenant_conf, tenant_id)?; + let created_tenant = + schedule_local_tenant_processing(conf, &tenant_directory, remote_storage, ctx)?; + let crated_tenant_id = created_tenant.tenant_id(); + anyhow::ensure!( tenant_id == crated_tenant_id, "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {crated_tenant_id})", ); - v.insert(Arc::clone(&created_tenant)); - Ok(Some(created_tenant)) - } - } + vacant_entry.insert(Arc::clone(&created_tenant)); + Ok(created_tenant) + }).await } pub async fn update_tenant_config( @@ -236,10 +291,11 @@ pub async fn update_tenant_config( tenant_id: TenantId, ) -> anyhow::Result<()> { info!("configuring tenant {tenant_id}"); - get_tenant(tenant_id, true) - .await? 
- .update_tenant_config(tenant_conf); - Tenant::persist_tenant_config(&conf.tenant_config_path(tenant_id), tenant_conf, false)?; + let tenant = get_tenant(tenant_id, true).await?; + + tenant.update_tenant_config(tenant_conf); + let tenant_config_path = conf.tenant_config_path(tenant_id); + Tenant::persist_tenant_config(&tenant.tenant_id(), &tenant_config_path, tenant_conf, false)?; Ok(()) } @@ -260,10 +316,14 @@ pub async fn get_tenant(tenant_id: TenantId, active_only: bool) -> anyhow::Resul } } -pub async fn delete_timeline(tenant_id: TenantId, timeline_id: TimelineId) -> anyhow::Result<()> { +pub async fn delete_timeline( + tenant_id: TenantId, + timeline_id: TimelineId, + ctx: &RequestContext, +) -> anyhow::Result<()> { match get_tenant(tenant_id, true).await { Ok(tenant) => { - tenant.delete_timeline(timeline_id).await?; + tenant.delete_timeline(timeline_id, ctx).await?; } Err(e) => anyhow::bail!("Cannot access tenant {tenant_id} in local tenant state: {e:?}"), } @@ -291,8 +351,9 @@ pub async fn load_tenant( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: Option, -) -> anyhow::Result<()> { - run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + ctx: &RequestContext, +) -> Result<(), TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { let tenant_path = conf.tenant_path(&tenant_id); let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(tenant_id); if tenant_ignore_mark.exists() { @@ -300,7 +361,7 @@ pub async fn load_tenant( .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?; } - let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage) + let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, remote_storage, ctx) .with_context(|| { format!("Failed to schedule tenant processing in path {tenant_path:?}") })?; @@ -329,16 +390,24 @@ pub async fn ignore_tenant( .await } +#[derive(Debug, thiserror::Error)] +pub enum TenantMapListError { + #[error("tenant map is still initiailizing")] + Initializing, +} + /// /// Get list of tenants, for the mgmt API /// -pub async fn list_tenants() -> Vec<(TenantId, TenantState)> { - TENANTS - .read() - .await - .iter() +pub async fn list_tenants() -> Result, TenantMapListError> { + let tenants = TENANTS.read().await; + let m = match &*tenants { + TenantsMap::Initializing => return Err(TenantMapListError::Initializing), + TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m, + }; + Ok(m.iter() .map(|(id, tenant)| (*id, tenant.current_state())) - .collect() + .collect()) } /// Execute Attach mgmt API command. 
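The typed errors added around here (TenantMapListError above, TenantMapInsertError just below) replace stringly-typed anyhow errors so callers can react to specific tenant-map states. The key mechanism is thiserror's #[error(transparent)] together with #[from], which lets a closure's anyhow::Error be lifted into the typed enum with a plain `?`. A minimal sketch with illustrative names, not the patch's types:

use anyhow::anyhow;
use thiserror::Error;

#[derive(Debug, Error)]
enum InsertError {
    #[error("map is shutting down")]
    ShuttingDown,
    // `transparent` forwards Display/source to the inner error;
    // `#[from]` generates the conversion used by `?` below.
    #[error(transparent)]
    Closure(#[from] anyhow::Error),
}

fn insert_with(f: impl FnOnce() -> anyhow::Result<u32>) -> Result<u32, InsertError> {
    Ok(f()?) // anyhow::Error -> InsertError::Closure via the generated From impl
}

fn main() {
    assert!(matches!(insert_with(|| Ok(7)), Ok(7)));
    assert!(matches!(
        insert_with(|| Err(anyhow!("boom"))),
        Err(InsertError::Closure(_))
    ));
    assert_eq!(InsertError::ShuttingDown.to_string(), "map is shutting down");
}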
@@ -349,34 +418,62 @@ pub async fn attach_tenant( conf: &'static PageServerConf, tenant_id: TenantId, remote_storage: GenericRemoteStorage, -) -> anyhow::Result<()> { - run_if_no_tenant_in_memory(tenant_id, |vacant_entry| { + ctx: &RequestContext, +) -> Result<(), TenantMapInsertError> { + tenant_map_insert(tenant_id, |vacant_entry| { let tenant_path = conf.tenant_path(&tenant_id); anyhow::ensure!( !tenant_path.exists(), "Cannot attach tenant {tenant_id}, local tenant directory already exists" ); - let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage); + let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx); vacant_entry.insert(tenant); - Ok(()) }) .await } -async fn run_if_no_tenant_in_memory(tenant_id: TenantId, run: F) -> anyhow::Result +#[derive(Debug, thiserror::Error)] +pub enum TenantMapInsertError { + #[error("tenant map is still initializing")] + StillInitializing, + #[error("tenant map is shutting down")] + ShuttingDown, + #[error("tenant {0} already exists, state: {1:?}")] + TenantAlreadyExists(TenantId, TenantState), + #[error(transparent)] + Closure(#[from] anyhow::Error), +} + +/// Give the given closure access to the tenants map entry for the given `tenant_id`, iff that +/// entry is vacant. The closure is responsible for creating the tenant object and inserting +/// it into the tenants map through the vacnt entry that it receives as argument. +/// +/// NB: the closure should return quickly because the current implementation of tenants map +/// serializes access through an `RwLock`. +async fn tenant_map_insert( + tenant_id: TenantId, + insert_fn: F, +) -> Result where F: FnOnce(hash_map::VacantEntry>) -> anyhow::Result, { - match TENANTS.write().await.entry(tenant_id) { - hash_map::Entry::Occupied(e) => { - anyhow::bail!( - "tenant {tenant_id} already exists, state: {:?}", - e.get().current_state() - ) - } - hash_map::Entry::Vacant(v) => run(v), + let mut guard = TENANTS.write().await; + let m = match &mut *guard { + TenantsMap::Initializing => return Err(TenantMapInsertError::StillInitializing), + TenantsMap::ShuttingDown(_) => return Err(TenantMapInsertError::ShuttingDown), + TenantsMap::Open(m) => m, + }; + match m.entry(tenant_id) { + hash_map::Entry::Occupied(e) => Err(TenantMapInsertError::TenantAlreadyExists( + tenant_id, + e.get().current_state(), + )), + hash_map::Entry::Vacant(v) => match insert_fn(v) { + Ok(v) => Ok(v), + Err(e) => Err(TenantMapInsertError::Closure(e)), + }, } } @@ -449,9 +546,9 @@ pub async fn immediate_gc( tenant_id: TenantId, timeline_id: TimelineId, gc_req: TimelineGcRequest, + ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().await; - let tenant = guard .get(&tenant_id) .map(Arc::clone) @@ -462,7 +559,8 @@ pub async fn immediate_gc( // Use tenant's pitr setting let pitr = tenant.get_pitr_interval(); - // Run in task_mgr to avoid race with detach operation + // Run in task_mgr to avoid race with tenant_detach operation + let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -474,7 +572,7 @@ pub async fn immediate_gc( async move { fail::fail_point!("immediate_gc_task_pre"); let result = tenant - .gc_iteration(Some(timeline_id), gc_horizon, pitr) + .gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx) .instrument(info_span!("manual_gc", tenant = %tenant_id, timeline = %timeline_id)) .await; // FIXME: `gc_iteration` can return an error 
for multiple reasons; we should handle it @@ -497,6 +595,7 @@ pub async fn immediate_gc( pub async fn immediate_compact( tenant_id: TenantId, timeline_id: TimelineId, + ctx: &RequestContext, ) -> Result>, ApiError> { let guard = TENANTS.read().await; @@ -510,7 +609,8 @@ pub async fn immediate_compact( .get_timeline(timeline_id, true) .map_err(ApiError::NotFound)?; - // Run in task_mgr to avoid race with detach operation + // Run in task_mgr to avoid race with tenant_detach operation + let ctx = ctx.detached_child(TaskKind::Compaction, DownloadBehavior::Download); let (task_done, wait_task_done) = tokio::sync::oneshot::channel(); task_mgr::spawn( &tokio::runtime::Handle::current(), @@ -523,7 +623,7 @@ pub async fn immediate_compact( false, async move { let result = timeline - .compact() + .compact(&ctx) .instrument( info_span!("manual_compact", tenant = %tenant_id, timeline = %timeline_id), ) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 013591caee..3f69017160 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1010,7 +1010,10 @@ impl RemoteTimelineClient { #[cfg(test)] mod tests { use super::*; - use crate::tenant::harness::{TenantHarness, TIMELINE_ID}; + use crate::{ + tenant::harness::{TenantHarness, TIMELINE_ID}, + DEFAULT_PG_VERSION, + }; use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use std::{collections::HashSet, path::Path}; use utils::lsn::Lsn; @@ -1064,9 +1067,19 @@ mod tests { // Test scheduling #[test] fn upload_scheduling() -> anyhow::Result<()> { + // Use a current-thread runtime in the test + let runtime = Box::leak(Box::new( + tokio::runtime::Builder::new_current_thread() + .enable_all() + .build()?, + )); + let _entered = runtime.enter(); + let harness = TenantHarness::create("upload_scheduling")?; + let (tenant, ctx) = runtime.block_on(harness.load()); + let _timeline = + tenant.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)?; let timeline_path = harness.timeline_path(&TIMELINE_ID); - std::fs::create_dir_all(&timeline_path)?; let remote_fs_dir = harness.conf.workdir.join("remote_fs"); std::fs::create_dir_all(remote_fs_dir)?; @@ -1084,14 +1097,6 @@ mod tests { storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()), }; - // Use a current-thread runtime in the test - let runtime = Box::leak(Box::new( - tokio::runtime::Builder::new_current_thread() - .enable_all() - .build()?, - )); - let _entered = runtime.enter(); - // Test outline: // // Schedule upload of a bunch of layers. Check that they are started immediately, not queued diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 61cb32fc76..2fed4f88b3 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -6,6 +6,7 @@ use anyhow::Context; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; +use crate::context::RequestContext; use crate::pgdatadir_mapping::CalculateLogicalSizeError; use super::Tenant; @@ -181,6 +182,7 @@ pub(super) async fn gather_inputs( tenant: &Tenant, limit: &Arc, logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>, + ctx: &RequestContext, ) -> anyhow::Result { // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to // our advantage with `?` error handling. 
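gather_inputs above combines a tokio JoinSet (so dropping the set de-schedules all pending tasks, which is what makes the `?` early returns safe) with a shared Semaphore that caps how many logical-size calculations run concurrently. A self-contained sketch of that combination, assuming tokio with the rt and macros features; the task bodies and numbers are placeholders:

use std::sync::Arc;
use tokio::sync::Semaphore;
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    // At most 2 "size calculations" in flight at once.
    let limit = Arc::new(Semaphore::new(2));
    let mut joinset = JoinSet::new();

    for i in 0..5u64 {
        let limit = Arc::clone(&limit);
        joinset.spawn(async move {
            let _permit = limit.acquire_owned().await.expect("semaphore closed");
            // ...do the expensive per-timeline work here...
            i * 10
        });
    }

    let mut total = 0;
    while let Some(res) = joinset.join_next().await {
        total += res.expect("task panicked");
    }
    assert_eq!(total, 100);
    // Returning early instead would drop `joinset` and cancel unfinished tasks.
}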
@@ -188,7 +190,7 @@ pub(super) async fn gather_inputs( // refresh is needed to update gc related pitr_cutoff and horizon_cutoff tenant - .refresh_gc_info() + .refresh_gc_info(ctx) .await .context("Failed to refresh gc_info before gathering inputs")?; @@ -329,7 +331,13 @@ pub(super) async fn gather_inputs( } else { let timeline = Arc::clone(&timeline); let parallel_size_calcs = Arc::clone(limit); - joinset.spawn(calculate_logical_size(parallel_size_calcs, timeline, *lsn)); + let ctx = ctx.attached_child(); + joinset.spawn(calculate_logical_size( + parallel_size_calcs, + timeline, + *lsn, + ctx, + )); } } @@ -387,6 +395,7 @@ pub(super) async fn gather_inputs( parallel_size_calcs, timeline.clone(), lsn, + ctx.attached_child(), )); if let Some(parent_id) = timeline.get_ancestor_timeline_id() { @@ -582,13 +591,14 @@ async fn calculate_logical_size( limit: Arc, timeline: Arc, lsn: utils::lsn::Lsn, + ctx: RequestContext, ) -> Result { let _permit = tokio::sync::Semaphore::acquire_owned(limit) .await .expect("global semaphore should not had been closed"); let size_res = timeline - .spawn_ondemand_logical_size_calculation(lsn) + .spawn_ondemand_logical_size_calculation(lsn, ctx) .instrument(info_span!("spawn_ondemand_logical_size_calculation")) .await?; Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res)) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 6aee8ce23c..2149fc7eb7 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -196,3 +196,50 @@ pub fn downcast_remote_layer( None } } + +impl std::fmt::Debug for dyn Layer { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Layer") + .field("short_id", &self.short_id()) + .finish() + } +} + +/// Holds metadata about a layer without any content. Used mostly for testing. +pub struct LayerDescriptor { + pub key: Range, + pub lsn: Range, + pub is_incremental: bool, + pub short_id: String, +} + +impl Layer for LayerDescriptor { + fn get_key_range(&self) -> Range { + self.key.clone() + } + + fn get_lsn_range(&self) -> Range { + self.lsn.clone() + } + + fn is_incremental(&self) -> bool { + self.is_incremental + } + + fn get_value_reconstruct_data( + &self, + _key: Key, + _lsn_range: Range, + _reconstruct_data: &mut ValueReconstructState, + ) -> Result { + todo!("This method shouldn't be part of the Layer trait") + } + + fn short_id(&self) -> String { + self.short_id.clone() + } + + fn dump(&self, _verbose: bool) -> Result<()> { + todo!() + } +} diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index b7ad8fe791..b126545ee4 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -5,6 +5,7 @@ use std::ops::ControlFlow; use std::sync::Arc; use std::time::Duration; +use crate::context::{DownloadBehavior, RequestContext}; use crate::metrics::TENANT_TASK_EVENTS; use crate::task_mgr; use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME}; @@ -52,19 +53,20 @@ async fn compaction_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); loop { trace!("waking up"); let tenant = tokio::select! 
{ _ = task_mgr::shutdown_watcher() => { info!("received cancellation request"); - return; + return; }, tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result { ControlFlow::Break(()) => return, ControlFlow::Continue(tenant) => tenant, }, - }; + }; let mut sleep_duration = tenant.get_compaction_period(); if sleep_duration == Duration::ZERO { @@ -73,7 +75,7 @@ async fn compaction_loop(tenant_id: TenantId) { sleep_duration = Duration::from_secs(10); } else { // Run compaction - if let Err(e) = tenant.compaction_iteration().await { + if let Err(e) = tenant.compaction_iteration(&ctx).await { sleep_duration = wait_duration; error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration); } @@ -103,6 +105,9 @@ async fn gc_loop(tenant_id: TenantId) { info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { + // GC might require downloading, to find the cutoff LSN that corresponds to the + // cutoff specified as time. + let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); loop { trace!("waking up"); @@ -127,7 +132,7 @@ async fn gc_loop(tenant_id: TenantId) { } else { // Run gc if gc_horizon > 0 { - if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval()).await + if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await { sleep_duration = wait_duration; error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d59858f582..0ca8a0e491 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,5 +1,7 @@ //! +mod walreceiver; + use anyhow::{anyhow, bail, ensure, Context}; use bytes::Bytes; use fail::fail_point; @@ -13,6 +15,7 @@ use pageserver_api::models::{ use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::id::TenantTimelineId; use std::cmp::{max, min, Ordering}; use std::collections::HashMap; @@ -23,6 +26,8 @@ use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering}; use std::sync::{Arc, Mutex, MutexGuard, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; +use crate::broker_client::is_broker_client_initialized; +use crate::context::{DownloadBehavior, RequestContext}; use crate::tenant::remote_timeline_client::{self, index::LayerFileMetadata}; use crate::tenant::storage_layer::{ DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer, LayerFileName, @@ -58,11 +63,11 @@ use crate::page_cache; use crate::repository::GcResult; use crate::repository::{Key, Value}; use crate::task_mgr::TaskKind; -use crate::walreceiver::{is_broker_client_initialized, spawn_connection_manager_task}; use crate::walredo::WalRedoManager; use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; +use walreceiver::spawn_connection_manager_task; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; @@ -128,7 +133,6 @@ pub struct Timeline { ancestor_timeline: Option>, ancestor_lsn: Lsn, - // Metrics metrics: TimelineMetrics, /// Ensures layers aren't frozen by checkpointer between @@ -377,6 +381,12 @@ pub enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), // source and Display delegate to anyhow::Error + /// The operation would require downloading a layer that is missing locally. 
+ NeedsDownload(TenantTimelineId, LayerFileName), + + /// The operation was cancelled + Cancelled, + /// An error happened replaying WAL records #[error(transparent)] WalRedo(#[from] crate::walredo::WalRedoError), @@ -386,6 +396,33 @@ impl std::fmt::Debug for PageReconstructError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { match self { Self::Other(err) => err.fmt(f), + Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { + write!( + f, + "layer {}/{} needs download", + tenant_timeline_id, + layer_file_name.file_name() + ) + } + Self::Cancelled => write!(f, "cancelled"), + Self::WalRedo(err) => err.fmt(f), + } + } +} + +impl std::fmt::Display for PageReconstructError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { + match self { + Self::Other(err) => err.fmt(f), + Self::NeedsDownload(tenant_timeline_id, layer_file_name) => { + write!( + f, + "layer {}/{} needs download", + tenant_timeline_id, + layer_file_name.file_name() + ) + } + Self::Cancelled => write!(f, "cancelled"), Self::WalRedo(err) => err.fmt(f), } } @@ -422,11 +459,24 @@ impl Timeline { /// an ancestor branch, for example, or waste a lot of cycles chasing the /// non-existing key. /// - pub async fn get(&self, key: Key, lsn: Lsn) -> Result { + pub async fn get( + &self, + key: Key, + lsn: Lsn, + ctx: &RequestContext, + ) -> Result { if !lsn.is_valid() { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + // XXX: structured stats collection for layer eviction here. + trace!( + "get page request for {}@{} from task kind {:?}", + key, + lsn, + ctx.task_kind() + ); + // Check the page cache. We will get back the most recent page with lsn <= `lsn`. // The cached image can be returned directly if there is no WAL between the cached image // and requested LSN. The cached image can also be used to reduce the amount of WAL needed @@ -450,7 +500,7 @@ impl Timeline { img: cached_page_img, }; - self.get_reconstruct_data(key, lsn, &mut reconstruct_state) + self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx) .await?; self.metrics @@ -513,13 +563,25 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub async fn wait_lsn(&self, lsn: Lsn) -> anyhow::Result<()> { + pub async fn wait_lsn( + &self, + lsn: Lsn, + _ctx: &RequestContext, /* Prepare for use by cancellation */ + ) -> anyhow::Result<()> { anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); // This should never be called from the WAL receiver, because that could lead // to a deadlock. 
anyhow::ensure!( - task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnection), + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), + "wait_lsn cannot be called in WAL receiver" + ); + anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), + "wait_lsn cannot be called in WAL receiver" + ); + anyhow::ensure!( + task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), "wait_lsn cannot be called in WAL receiver" ); @@ -558,7 +620,7 @@ impl Timeline { self.flush_frozen_layers_and_wait().await } - pub async fn compact(&self) -> anyhow::Result<()> { + pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); // Last record Lsn could be zero in case the timeline was just created @@ -616,14 +678,16 @@ impl Timeline { .repartition( self.get_last_record_lsn(), self.get_compaction_target_size(), + ctx, ) .await { Ok((partitioning, lsn)) => { // 2. Create new image layers for partitions that have been modified // "enough". - let layer_paths_to_upload = - self.create_image_layers(&partitioning, lsn, false).await?; + let layer_paths_to_upload = self + .create_image_layers(&partitioning, lsn, false, ctx) + .await?; if let Some(remote_client) = &self.remote_client { for (path, layer_metadata) in layer_paths_to_upload { remote_client.schedule_layer_file_upload(&path, &layer_metadata)?; @@ -673,7 +737,10 @@ impl Timeline { /// the initial size calculation has not been run (gets triggered on the first size access). /// /// return size and boolean flag that shows if the size is exact - pub fn get_current_logical_size(self: &Arc) -> anyhow::Result<(u64, bool)> { + pub fn get_current_logical_size( + self: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result<(u64, bool)> { let current_size = self.current_logical_size.current_size()?; debug!("Current size: {current_size:?}"); @@ -683,7 +750,7 @@ impl Timeline { (current_size, self.current_logical_size.initial_part_end) { is_exact = false; - self.try_spawn_size_init_task(init_lsn); + self.try_spawn_size_init_task(init_lsn, ctx); } Ok((size, is_exact)) @@ -729,16 +796,24 @@ impl Timeline { Ok(()) } + pub fn activate(self: &Arc) { + self.set_state(TimelineState::Active); + self.launch_wal_receiver(); + } + pub fn set_state(&self, new_state: TimelineState) { match (self.current_state(), new_state) { (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => { - debug!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}"); + } + (st, TimelineState::Loading) => { + error!("ignoring transition from {st:?} into Loading state"); } (TimelineState::Broken, _) => { error!("Ignoring state update {new_state:?} for broken tenant"); } (TimelineState::Stopping, TimelineState::Active) => { - debug!("Not activating a Stopping timeline"); + error!("Not activating a Stopping timeline"); } (_, new_state) => { self.state.send_replace(new_state); @@ -812,7 +887,7 @@ impl Timeline { pg_version: u32, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); - let (state, _) = watch::channel(TimelineState::Suspended); + let (state, _) = watch::channel(TimelineState::Loading); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); @@ -884,6 +959,10 @@ impl Timeline { }; result.repartition_threshold = result.get_checkpoint_distance() / 10; result 
+ .metrics + .last_record_gauge + .set(disk_consistent_lsn.0 as i64); + result }) } @@ -909,22 +988,25 @@ impl Timeline { let layer_flush_start_rx = self.layer_flush_start_tx.subscribe(); let self_clone = Arc::clone(self); + info!("spawning flush loop"); task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::LayerFlushTask, - Some(self.tenant_id), - Some(self.timeline_id), - "layer flush task", - false, - async move { - self_clone.flush_loop(layer_flush_start_rx).await; - let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); - assert_eq!(*flush_loop_state, FlushLoopState::Running); - *flush_loop_state = FlushLoopState::Exited; - Ok(()) } - .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) - ); + task_mgr::BACKGROUND_RUNTIME.handle(), + task_mgr::TaskKind::LayerFlushTask, + Some(self.tenant_id), + Some(self.timeline_id), + "layer flush task", + false, + async move { + let background_ctx = RequestContext::todo_child(TaskKind::LayerFlushTask, DownloadBehavior::Error); + self_clone.flush_loop(layer_flush_start_rx, &background_ctx).await; + let mut flush_loop_state = self_clone.flush_loop_state.lock().unwrap(); + assert_eq!(*flush_loop_state, FlushLoopState::Running); + *flush_loop_state = FlushLoopState::Exited; + Ok(()) + } + .instrument(info_span!(parent: None, "layer flush task", tenant = %self.tenant_id, timeline = %self.timeline_id)) + ); *flush_loop_state = FlushLoopState::Running; } @@ -955,12 +1037,16 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag); drop(tenant_conf_guard); let self_clone = Arc::clone(self); + let background_ctx = + // XXX: this is a detached_child. Plumb through the ctx from call sites. + RequestContext::todo_child(TaskKind::WalReceiverManager, DownloadBehavior::Error); spawn_connection_manager_task( self_clone, walreceiver_connect_timeout, lagging_wal_timeout, max_lsn_wal_lag, crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), + background_ctx, ); } @@ -970,6 +1056,7 @@ impl Timeline { /// pub(super) fn load_layer_map(&self, disk_consistent_lsn: Lsn) -> anyhow::Result<()> { let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let mut num_layers = 0; let timer = self.metrics.load_layer_map_histo.start_timer(); @@ -1010,7 +1097,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - layers.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1041,7 +1128,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - layers.insert_historic(Arc::new(layer)); + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1067,6 +1154,7 @@ impl Timeline { } } + updates.flush(); layers.next_open_layer_at = Some(Lsn(disk_consistent_lsn.0) + 1); info!( @@ -1091,6 +1179,11 @@ impl Timeline { // Are we missing some files that are present in remote storage? // Create RemoteLayer instances for them. let mut local_only_layers = local_layers; + + // We're holding a layer map lock for a while but this + // method is only called during init so it's fine. 
+ let mut layer_map = self.layers.write().unwrap(); + let mut updates = layer_map.batch_update(); for remote_layer_name in &index_part.timeline_layers { let local_layer = local_only_layers.remove(remote_layer_name); @@ -1129,7 +1222,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); - self.layers.write().unwrap().remove_historic(local_layer); + updates.remove_historic(local_layer); // fall-through to adding the remote layer } } else { @@ -1171,7 +1264,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - self.layers.write().unwrap().insert_historic(remote_layer); + updates.insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1194,13 +1287,14 @@ impl Timeline { &remote_layer_metadata, ); let remote_layer = Arc::new(remote_layer); - self.layers.write().unwrap().insert_historic(remote_layer); + updates.insert_historic(remote_layer); } #[cfg(test)] LayerFileName::Test(_) => unreachable!(), } } + updates.flush(); Ok(local_only_layers) } @@ -1280,7 +1374,7 @@ impl Timeline { Ok(()) } - fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn) { + fn try_spawn_size_init_task(self: &Arc, init_lsn: Lsn, ctx: &RequestContext) { let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) .try_acquire_owned() { @@ -1296,8 +1390,18 @@ impl Timeline { .initial_logical_size .get() .is_none()); + + info!( + "spawning logical size computation from context of task kind {:?}", + ctx.task_kind() + ); // We need to start the computation task. + // It gets a separate context since it will outlive the request that called this function. let self_clone = Arc::clone(self); + let background_ctx = ctx.detached_child( + TaskKind::InitialLogicalSizeCalculation, + DownloadBehavior::Download, + ); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), task_mgr::TaskKind::InitialLogicalSizeCalculation, @@ -1307,7 +1411,9 @@ impl Timeline { false, // NB: don't log errors here, task_mgr will do that. async move { - let calculated_size = match self_clone.logical_size_calculation_task(init_lsn).await + let calculated_size = match self_clone + .logical_size_calculation_task(init_lsn, &background_ctx) + .await { Ok(s) => s, Err(CalculateLogicalSizeError::Cancelled) => { @@ -1342,18 +1448,27 @@ impl Timeline { pub fn spawn_ondemand_logical_size_calculation( self: &Arc, lsn: Lsn, + ctx: RequestContext, ) -> oneshot::Receiver> { let (sender, receiver) = oneshot::channel(); let self_clone = Arc::clone(self); + // XXX if our caller loses interest, i.e., ctx is cancelled, + // we should stop the size calculation work and return an error. + // That would require restructuring this function's API to + // return the result directly, instead of a Receiver for the result. 
+ let ctx = ctx.detached_child( + TaskKind::OndemandLogicalSizeCalculation, + DownloadBehavior::Download, + ); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), - task_mgr::TaskKind::InitialLogicalSizeCalculation, + task_mgr::TaskKind::OndemandLogicalSizeCalculation, Some(self.tenant_id), Some(self.timeline_id), "ondemand logical size calculation", false, async move { - let res = self_clone.logical_size_calculation_task(lsn).await; + let res = self_clone.logical_size_calculation_task(lsn, &ctx).await; let _ = sender.send(res).ok(); Ok(()) // Receiver is responsible for handling errors }, @@ -1365,6 +1480,7 @@ impl Timeline { async fn logical_size_calculation_task( self: &Arc, init_lsn: Lsn, + ctx: &RequestContext, ) -> Result { let mut timeline_state_updates = self.subscribe_for_state_updates(); let self_calculation = Arc::clone(self); @@ -1372,12 +1488,13 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); + let ctx = ctx.attached_child(); tokio::task::spawn_blocking(move || { // Run in a separate thread since this can do a lot of // synchronous file IO without .await inbetween // if there are no RemoteLayers that would require downloading. let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel)) + h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) }) .await .context("Failed to spawn calculation result task")? @@ -1392,7 +1509,7 @@ impl Timeline { TimelineState::Active => continue, TimelineState::Broken | TimelineState::Stopping - | TimelineState::Suspended => { + | TimelineState::Loading => { break format!("aborted because timeline became inactive (new state: {new_state:?})") } } @@ -1432,10 +1549,11 @@ impl Timeline { /// /// NOTE: counted incrementally, includes ancestors. This can be a slow operation, /// especially if we need to download remote layers. - async fn calculate_logical_size( + pub async fn calculate_logical_size( &self, up_to_lsn: Lsn, cancel: CancellationToken, + ctx: &RequestContext, ) -> Result { info!( "Calculating logical size for timeline {} at {}", @@ -1478,7 +1596,7 @@ impl Timeline { self.metrics.logical_size_histo.start_timer() }; let logical_size = self - .get_current_logical_size_non_incremental(up_to_lsn, cancel) + .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx) .await?; debug!("calculated logical size: {logical_size}"); timer.stop_and_record(); @@ -1555,6 +1673,7 @@ impl Timeline { key: Key, request_lsn: Lsn, reconstruct_state: &mut ValueReconstructState, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { // Start from the current timeline. let mut timeline_owned; @@ -1742,14 +1861,43 @@ impl Timeline { let remote_layer_as_persistent: Arc = Arc::clone(&remote_layer) as Arc; let id = remote_layer_as_persistent.traversal_id(); - info!("need remote layer {id}"); + info!( + "need remote layer {} for task kind {:?}", + id, + ctx.task_kind() + ); // The next layer doesn't exist locally. Need to download it. // (The control flow is a bit complicated here because we must drop the 'layers' // lock before awaiting on the Future.) 
- info!("on-demand downloading remote layer {id}"); - timeline.download_remote_layer(remote_layer).await?; - continue 'layer_map_search; + match ( + ctx.download_behavior(), + self.conf.ondemand_download_behavior_treat_error_as_warn, + ) { + (DownloadBehavior::Download, _) => { + info!( + "on-demand downloading remote layer {id} for task kind {:?}", + ctx.task_kind() + ); + timeline.download_remote_layer(remote_layer).await?; + continue 'layer_map_search; + } + (DownloadBehavior::Warn, _) | (DownloadBehavior::Error, true) => { + warn!( + "unexpectedly on-demand downloading remote layer {} for task kind {:?}", + id, + ctx.task_kind() + ); + timeline.download_remote_layer(remote_layer).await?; + continue 'layer_map_search; + } + (DownloadBehavior::Error, false) => { + return Err(PageReconstructError::NeedsDownload( + TenantTimelineId::new(self.tenant_id, self.timeline_id), + remote_layer.file_name.clone(), + )) + } + } } } } @@ -1871,7 +2019,11 @@ impl Timeline { } /// Layer flusher task's main loop. - async fn flush_loop(&self, mut layer_flush_start_rx: tokio::sync::watch::Receiver) { + async fn flush_loop( + &self, + mut layer_flush_start_rx: tokio::sync::watch::Receiver, + ctx: &RequestContext, + ) { info!("started flush loop"); loop { tokio::select! { @@ -1892,7 +2044,7 @@ impl Timeline { // drop 'layers' lock to allow concurrent reads and writes }; if let Some(layer_to_flush) = layer_to_flush { - if let Err(err) = self.flush_frozen_layer(layer_to_flush).await { + if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await { error!("could not flush frozen layer: {err:?}"); break Err(err); } @@ -1957,8 +2109,12 @@ impl Timeline { } /// Flush one frozen in-memory layer to disk, as a new delta layer. - #[instrument(skip(self, frozen_layer), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] - async fn flush_frozen_layer(&self, frozen_layer: Arc) -> anyhow::Result<()> { + #[instrument(skip(self, frozen_layer, ctx), fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))] + async fn flush_frozen_layer( + &self, + frozen_layer: Arc, + ctx: &RequestContext, + ) -> anyhow::Result<()> { // As a special case, when we have just imported an image into the repository, // instead of writing out a L0 delta layer, we directly write out image layer // files instead. This is possible as long as *all* the data imported into the @@ -1966,10 +2122,12 @@ impl Timeline { let lsn_range = frozen_layer.get_lsn_range(); let layer_paths_to_upload = if lsn_range.start == self.initdb_lsn && lsn_range.end == Lsn(self.initdb_lsn.0 + 1) { + // Note: The 'ctx' in use here has DownloadBehavior::Error. We should not + // require downloading anything during initial import. let (partitioning, _lsn) = self - .repartition(self.initdb_lsn, self.get_compaction_target_size()) + .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx) .await?; - self.create_image_layers(&partitioning, self.initdb_lsn, true) + self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx) .await? } else { // normal case, write out a L0 delta layer file. 
@@ -2099,10 +2257,11 @@ impl Timeline { ])?; // Add it to the layer map - { - let mut layers = self.layers.write().unwrap(); - layers.insert_historic(Arc::new(new_delta)); - } + self.layers + .write() + .unwrap() + .batch_update() + .insert_historic(Arc::new(new_delta)); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2119,6 +2278,7 @@ impl Timeline { &self, lsn: Lsn, partition_size: u64, + ctx: &RequestContext, ) -> anyhow::Result<(KeyPartitioning, Lsn)> { { let partitioning_guard = self.partitioning.lock().unwrap(); @@ -2129,7 +2289,7 @@ impl Timeline { return Ok((partitioning_guard.0.clone(), partitioning_guard.1)); } } - let keyspace = self.collect_keyspace(lsn).await?; + let keyspace = self.collect_keyspace(lsn, ctx).await?; let partitioning = keyspace.partition(partition_size); let mut partitioning_guard = self.partitioning.lock().unwrap(); @@ -2166,13 +2326,15 @@ impl Timeline { // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed // after we read last_record_lsn, which is passed here in the 'lsn' argument. if img_lsn < lsn { - let num_deltas = layers.count_deltas(&img_range, &(img_lsn..lsn))?; + let threshold = self.get_image_creation_threshold(); + let num_deltas = + layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?; debug!( "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}", img_range.start, img_range.end, num_deltas, img_lsn, lsn ); - if num_deltas >= self.get_image_creation_threshold() { + if num_deltas >= threshold { return Ok(true); } } @@ -2187,6 +2349,7 @@ impl Timeline { partitioning: &KeyPartitioning, lsn: Lsn, force: bool, + ctx: &RequestContext, ) -> Result, PageReconstructError> { let timer = self.metrics.create_images_time_histo.start_timer(); let mut image_layers: Vec = Vec::new(); @@ -2211,7 +2374,7 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { - let img = match self.get(key, lsn).await { + let img = match self.get(key, lsn, ctx).await { Ok(img) => img, Err(err) => { // If we fail to reconstruct a VM or FSM page, we can zero the @@ -2267,21 +2430,23 @@ impl Timeline { let mut layer_paths_to_upload = HashMap::with_capacity(image_layers.len()); let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id); for l in image_layers { let path = l.filename(); let metadata = timeline_path .join(path.file_name()) .metadata() - .context("reading metadata of layer file {path}")?; + .with_context(|| format!("reading metadata of layer file {}", path.file_name()))?; layer_paths_to_upload.insert(path, LayerFileMetadata::new(metadata.len())); self.metrics .resident_physical_size_gauge .add(metadata.len()); - layers.insert_historic(Arc::new(l)); + updates.insert_historic(Arc::new(l)); } + updates.flush(); drop(layers); timer.stop_and_record(); @@ -2577,6 +2742,7 @@ impl Timeline { } let mut layers = self.layers.write().unwrap(); + let mut updates = layers.batch_update(); let mut new_layer_paths = HashMap::with_capacity(new_layers.len()); for l in new_layers { let new_delta_path = l.path(); @@ -2597,7 +2763,7 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); - layers.insert_historic(x); + updates.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can @@ -2611,8 +2777,9 @@ impl Timeline { } 
layer_names_to_delete.push(l.filename()); l.delete()?; - layers.remove_historic(l); + updates.remove_historic(l); } + updates.flush(); drop(layers); // Also schedule the deletions in remote storage @@ -2662,6 +2829,7 @@ impl Timeline { retain_lsns: Vec, cutoff_horizon: Lsn, pitr: Duration, + ctx: &RequestContext, ) -> anyhow::Result<()> { // First, calculate pitr_cutoff_timestamp and then convert it to LSN. // @@ -2674,7 +2842,7 @@ impl Timeline { if let Some(pitr_cutoff_timestamp) = now.checked_sub(pitr) { let pitr_timestamp = to_pg_timestamp(pitr_cutoff_timestamp); - match self.find_lsn_for_timestamp(pitr_timestamp).await? { + match self.find_lsn_for_timestamp(pitr_timestamp, ctx).await? { LsnForTimestamp::Present(lsn) => lsn, LsnForTimestamp::Future(lsn) => { // The timestamp is in the future. That sounds impossible, @@ -2725,6 +2893,8 @@ impl Timeline { /// obsolete. /// pub(super) async fn gc(&self) -> anyhow::Result { + let timer = self.metrics.garbage_collect_histo.start_timer(); + fail_point!("before-timeline-gc"); let _layer_removal_cs = self.layer_removal_cs.lock().await; @@ -2745,11 +2915,17 @@ impl Timeline { let new_gc_cutoff = Lsn::min(horizon_cutoff, pitr_cutoff); - self.gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) + let res = self + .gc_timeline(horizon_cutoff, pitr_cutoff, retain_lsns, new_gc_cutoff) .instrument( info_span!("gc_timeline", timeline = %self.timeline_id, cutoff = %new_gc_cutoff), ) - .await + .await?; + + // only record successes + timer.stop_and_record(); + + Ok(res) } async fn gc_timeline( @@ -2812,6 +2988,7 @@ impl Timeline { // 3. it doesn't need to be retained for 'retain_lsns'; // 4. newer on-disk image layers cover the layer's whole key range // + // TODO holding a write lock is too agressive and avoidable let mut layers = self.layers.write().unwrap(); 'outer: for l in layers.iter_historic_layers() { result.layers_total += 1; @@ -2843,6 +3020,8 @@ impl Timeline { // might be referenced by child branches forever. // We can track this in child timeline GC and delete parent layers when // they are no longer needed. This might be complicated with long inheritance chains. + // + // TODO Vec is not a great choice for `retain_lsns` for retain_lsn in &retain_lsns { // start_lsn is inclusive if &l.get_lsn_range().start <= retain_lsn { @@ -2896,6 +3075,7 @@ impl Timeline { layers_to_remove.push(Arc::clone(&l)); } + let mut updates = layers.batch_update(); if !layers_to_remove.is_empty() { // Persist the new GC cutoff value in the metadata file, before // we actually remove anything. @@ -2913,7 +3093,13 @@ impl Timeline { } layer_names_to_delete.push(doomed_layer.filename()); doomed_layer.delete()?; // FIXME: schedule succeeded deletions before returning? - layers.remove_historic(doomed_layer); + + // TODO Removing from the bottom of the layer map is expensive. + // Maybe instead discard all layer map historic versions that + // won't be needed for page reconstruction for this timeline, + // and mark what we can't delete yet as deleted from the layer + // map index without actually rebuilding the index. + updates.remove_historic(doomed_layer); result.layers_removed += 1; } @@ -2925,6 +3111,7 @@ impl Timeline { remote_client.schedule_layer_file_deletion(&layer_names_to_delete)?; } } + updates.flush(); info!( "GC completed removing {} layers, cutoff {}", @@ -3081,11 +3268,13 @@ impl Timeline { // Delta- or ImageLayer in the layer map. 
let new_layer = remote_layer.create_downloaded_layer(self_clone.conf, *size); let mut layers = self_clone.layers.write().unwrap(); + let mut updates = layers.batch_update(); { let l: Arc = remote_layer.clone(); - layers.remove_historic(l); + updates.remove_historic(l); } - layers.insert_historic(new_layer); + updates.insert_historic(new_layer); + updates.flush(); drop(layers); // Now that we've inserted the download into the layer map, diff --git a/pageserver/src/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs similarity index 83% rename from pageserver/src/walreceiver.rs rename to pageserver/src/tenant/timeline/walreceiver.rs index fc9daadc5c..f33a12c5cc 100644 --- a/pageserver/src/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -23,58 +23,15 @@ mod connection_manager; mod walreceiver_connection; -use crate::config::PageServerConf; use crate::task_mgr::WALRECEIVER_RUNTIME; -use anyhow::Context; -use once_cell::sync::OnceCell; use std::future::Future; -use storage_broker::BrokerClientChannel; use tokio::sync::watch; use tokio_util::sync::CancellationToken; use tracing::*; pub use connection_manager::spawn_connection_manager_task; -static BROKER_CLIENT: OnceCell = OnceCell::new(); - -/// -/// Initialize the broker client. This must be called once at page server startup. -/// -pub async fn init_broker_client(conf: &'static PageServerConf) -> anyhow::Result<()> { - let broker_endpoint = conf.broker_endpoint.clone(); - - // Note: we do not attempt connecting here (but validate endpoints sanity). - let broker_client = - storage_broker::connect(broker_endpoint.clone(), conf.broker_keepalive_interval).context( - format!( - "Failed to create broker client to {}", - &conf.broker_endpoint - ), - )?; - - if BROKER_CLIENT.set(broker_client).is_err() { - panic!("broker already initialized"); - } - - info!( - "Initialized broker client with endpoints: {}", - broker_endpoint - ); - Ok(()) -} - -/// -/// Get a handle to the broker client -/// -pub fn get_broker_client() -> &'static BrokerClientChannel { - BROKER_CLIENT.get().expect("broker client not initialized") -} - -pub fn is_broker_client_initialized() -> bool { - BROKER_CLIENT.get().is_some() -} - /// A handle of an asynchronous task. /// The task has a channel that it can use to communicate its lifecycle events in a certain form, see [`TaskEvent`] /// and a cancellation token that it can listen to for earlier interrupts. 
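The doc comment above describes a TaskHandle as a spawned task paired with a lifecycle-event channel and a cancellation token. Below is a minimal sketch of that shape, assuming the tokio and tokio-util crates; the types are simplified stand-ins, not the generic TaskHandle defined in walreceiver.rs.

// Minimal sketch of a task handle: progress reported over a watch channel,
// shutdown requested through a cancellation token.
use tokio::sync::watch;
use tokio_util::sync::CancellationToken;

#[derive(Debug, Clone)]
enum TaskStateUpdate {
    Started,
    Progress(u64),
}

struct TaskHandle {
    events: watch::Receiver<TaskStateUpdate>,
    cancel: CancellationToken,
    join: tokio::task::JoinHandle<()>,
}

fn spawn_task() -> TaskHandle {
    let (tx, rx) = watch::channel(TaskStateUpdate::Started);
    let cancel = CancellationToken::new();
    let child_cancel = cancel.clone();
    let join = tokio::spawn(async move {
        let mut n = 0u64;
        loop {
            tokio::select! {
                // Stop promptly when the handle owner asks us to.
                _ = child_cancel.cancelled() => break,
                _ = tokio::time::sleep(std::time::Duration::from_millis(100)) => {
                    n += 1;
                    // Report progress; watch receivers only see the latest value.
                    let _ = tx.send(TaskStateUpdate::Progress(n));
                }
            }
        }
    });
    TaskHandle { events: rx, cancel, join }
}

#[tokio::main]
async fn main() {
    let mut handle = spawn_task();
    // Observe one progress update, then shut the task down.
    handle.events.changed().await.unwrap();
    println!("latest: {:?}", *handle.events.borrow());
    handle.cancel.cancel();
    handle.join.await.unwrap();
}

A watch channel retains only the most recent update, which fits a consumer that only acts on the latest reported status, as the connection manager loop below does with wal_connection_update.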
@@ -95,7 +52,6 @@ pub enum TaskEvent { #[derive(Debug, Clone)] pub enum TaskStateUpdate { - Init, Started, Progress(E), } diff --git a/pageserver/src/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs similarity index 96% rename from pageserver/src/walreceiver/connection_manager.rs rename to pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 8b60e59305..cd7c7c51d2 100644 --- a/pageserver/src/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -11,10 +11,12 @@ use std::{collections::HashMap, num::NonZeroU64, ops::ControlFlow, sync::Arc, time::Duration}; -use crate::task_mgr::TaskKind; +use super::TaskStateUpdate; +use crate::broker_client::get_broker_client; +use crate::context::RequestContext; use crate::task_mgr::WALRECEIVER_RUNTIME; +use crate::task_mgr::{self, TaskKind}; use crate::tenant::Timeline; -use crate::{task_mgr, walreceiver::TaskStateUpdate}; use anyhow::Context; use chrono::{NaiveDateTime, Utc}; use pageserver_api::models::TimelineState; @@ -27,10 +29,7 @@ use storage_broker::Streaming; use tokio::{select, sync::watch}; use tracing::*; -use crate::{ - exponential_backoff, walreceiver::get_broker_client, DEFAULT_BASE_BACKOFF_SECONDS, - DEFAULT_MAX_BACKOFF_SECONDS, -}; +use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS}; use postgres_connection::{parse_host_port, PgConnectionConfig}; use utils::{ id::{NodeId, TenantTimelineId}, @@ -46,6 +45,7 @@ pub fn spawn_connection_manager_task( lagging_wal_timeout: Duration, max_lsn_wal_lag: NonZeroU64, auth_token: Option>, + ctx: RequestContext, ) { let mut broker_client = get_broker_client().clone(); @@ -78,6 +78,7 @@ pub fn spawn_connection_manager_task( loop_step_result = connection_manager_loop_step( &mut broker_client, &mut walreceiver_state, + &ctx, ) => match loop_step_result { ControlFlow::Continue(()) => continue, ControlFlow::Break(()) => { @@ -101,6 +102,7 @@ pub fn spawn_connection_manager_task( async fn connection_manager_loop_step( broker_client: &mut BrokerClientChannel, walreceiver_state: &mut WalreceiverState, + ctx: &RequestContext, ) -> ControlFlow<(), ()> { let mut timeline_state_updates = walreceiver_state.timeline.subscribe_for_state_updates(); @@ -145,7 +147,7 @@ async fn connection_manager_loop_step( let wal_connection = walreceiver_state.wal_connection.as_mut() .expect("Should have a connection, as checked by the corresponding select! 
guard"); match wal_connection_update { - TaskEvent::Update(TaskStateUpdate::Init | TaskStateUpdate::Started) => {}, + TaskEvent::Update(TaskStateUpdate::Started) => {}, TaskEvent::Update(TaskStateUpdate::Progress(new_status)) => { if new_status.has_processed_wal { // We have advanced last_record_lsn by processing the WAL received @@ -183,13 +185,23 @@ async fn connection_manager_loop_step( new_event = async { loop { + if walreceiver_state.timeline.current_state() == TimelineState::Loading { + warn!("wal connection manager should only be launched after timeline has become active"); + } match timeline_state_updates.changed().await { Ok(()) => { let new_state = walreceiver_state.timeline.current_state(); match new_state { // we're already active as walreceiver, no need to reactivate TimelineState::Active => continue, - TimelineState::Broken | TimelineState::Stopping | TimelineState::Suspended => return ControlFlow::Continue(new_state), + TimelineState::Broken | TimelineState::Stopping => { + info!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop"); + return ControlFlow::Break(()); + } + TimelineState::Loading => { + warn!("timeline transitioned back to Loading state, that should not happen"); + return ControlFlow::Continue(new_state); + } } } Err(_sender_dropped_error) => return ControlFlow::Break(()), @@ -197,7 +209,7 @@ async fn connection_manager_loop_step( } } => match new_event { ControlFlow::Continue(new_state) => { - info!("Timeline became inactive (new state: {new_state:?}), dropping current connections until it reactivates"); + info!("observed timeline state change, new state is {new_state:?}"); return ControlFlow::Continue(()); } ControlFlow::Break(()) => { @@ -226,6 +238,7 @@ async fn connection_manager_loop_step( .change_connection( new_candidate.safekeeper_id, new_candidate.wal_source_connconf, + ctx, ) .await } @@ -289,7 +302,9 @@ async fn subscribe_for_timeline_updates( return resp.into_inner(); } Err(e) => { - warn!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); + // Safekeeper nodes can stop pushing timeline updates to the broker, when no new writes happen and + // entire WAL is streamed. Keep this noticeable with logging, but do not warn/error. 
+ info!("Attempt #{attempt}, failed to subscribe for timeline {id} updates in broker: {e:#}"); continue; } } @@ -389,12 +404,17 @@ impl WalreceiverState { &mut self, new_sk_id: NodeId, new_wal_source_connconf: PgConnectionConfig, + ctx: &RequestContext, ) { self.drop_old_connection(true).await; let id = self.id; let connect_timeout = self.wal_connect_timeout; let timeline = Arc::clone(&self.timeline); + let ctx = ctx.detached_child( + TaskKind::WalReceiverConnectionHandler, + ctx.download_behavior(), + ); let connection_handle = TaskHandle::spawn(move |events_sender, cancellation| { async move { super::walreceiver_connection::handle_walreceiver_connection( @@ -403,6 +423,7 @@ impl WalreceiverState { events_sender, cancellation, connect_timeout, + ctx, ) .await .context("walreceiver connection handling failure") @@ -1233,18 +1254,18 @@ mod tests { const DUMMY_SAFEKEEPER_HOST: &str = "safekeeper_connstr"; async fn dummy_state(harness: &TenantHarness<'_>) -> WalreceiverState { + let (tenant, ctx) = harness.load().await; + let timeline = tenant + .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION, &ctx) + .expect("Failed to create an empty timeline for dummy wal connection manager"); + let timeline = timeline.initialize(&ctx).unwrap(); + WalreceiverState { id: TenantTimelineId { tenant_id: harness.tenant_id, timeline_id: TIMELINE_ID, }, - timeline: harness - .load() - .await - .create_empty_timeline(TIMELINE_ID, Lsn(0), crate::DEFAULT_PG_VERSION) - .expect("Failed to create an empty timeline for dummy wal connection manager") - .initialize() - .unwrap(), + timeline, wal_connect_timeout: Duration::from_secs(1), lagging_wal_timeout: Duration::from_secs(1), max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), diff --git a/pageserver/src/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs similarity index 94% rename from pageserver/src/walreceiver/walreceiver_connection.rs rename to pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 1b9e4923fb..7e06c398af 100644 --- a/pageserver/src/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -22,7 +22,9 @@ use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; -use crate::{metrics::LIVE_CONNECTIONS_COUNT, walreceiver::TaskStateUpdate}; +use super::TaskStateUpdate; +use crate::context::RequestContext; +use crate::metrics::LIVE_CONNECTIONS_COUNT; use crate::{ task_mgr, task_mgr::TaskKind, @@ -62,6 +64,7 @@ pub async fn handle_walreceiver_connection( events_sender: watch::Sender>, cancellation: CancellationToken, connect_timeout: Duration, + ctx: RequestContext, ) -> anyhow::Result<()> { // Connect to the database in replication mode. info!("connecting to {wal_source_connconf:?}"); @@ -77,9 +80,13 @@ pub async fn handle_walreceiver_connection( info!("DB connection stream finished: {expected_error}"); return Ok(()); } - Err(elapsed) => anyhow::bail!( - "Timed out while waiting {elapsed} for walreceiver connection to open" - ), + Err(_) => { + // Timing out to connect to a safekeeper node could happen long time, due to + // many reasons that pageserver cannot control. + // Do not produce an error, but make it visible, that timeouts happen by logging the `event. 
+ info!("Timed out while waiting {connect_timeout:?} for walreceiver connection to open"); + return Ok(()); + } } }; @@ -99,10 +106,14 @@ pub async fn handle_walreceiver_connection( // The connection object performs the actual communication with the database, // so spawn it off to run on its own. + let _connection_ctx = ctx.detached_child( + TaskKind::WalReceiverConnectionPoller, + ctx.download_behavior(), + ); let connection_cancellation = cancellation.clone(); task_mgr::spawn( WALRECEIVER_RUNTIME.handle(), - TaskKind::WalReceiverConnection, + TaskKind::WalReceiverConnectionPoller, Some(timeline.tenant_id), Some(timeline.timeline_id), "walreceiver connection", @@ -117,7 +128,7 @@ pub async fn handle_walreceiver_connection( } } }, - + // Future: replace connection_cancellation with connection_ctx cancellation _ = connection_cancellation.cancelled() => info!("Connection cancelled"), } Ok(()) @@ -180,7 +191,7 @@ pub async fn handle_walreceiver_connection( let mut waldecoder = WalStreamDecoder::new(startpoint, timeline.pg_version); - let mut walingest = WalIngest::new(timeline.as_ref(), startpoint).await?; + let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; while let Some(replication_message) = { select! { @@ -251,7 +262,7 @@ pub async fn handle_walreceiver_connection( ensure!(lsn.is_aligned()); walingest - .ingest_record(recdata.clone(), lsn, &mut modification, &mut decoded) + .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; @@ -329,7 +340,7 @@ pub async fn handle_walreceiver_connection( // Send the replication feedback message. // Regular standby_status_update fields are put into this message. let (timeline_logical_size, _) = timeline - .get_current_logical_size() + .get_current_logical_size(&ctx) .context("Status update creation failed to get current logical size")?; let status_update = ReplicationFeedback { current_timeline_size: timeline_logical_size, diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 0de2e6654d..3761c65668 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,6 +29,7 @@ use anyhow::Result; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use crate::context::RequestContext; use crate::pgdatadir_mapping::*; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; @@ -52,10 +53,14 @@ pub struct WalIngest<'a> { } impl<'a> WalIngest<'a> { - pub async fn new(timeline: &Timeline, startpoint: Lsn) -> anyhow::Result { + pub async fn new( + timeline: &'a Timeline, + startpoint: Lsn, + ctx: &'_ RequestContext, + ) -> anyhow::Result> { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. 
- let checkpoint_bytes = timeline.get_checkpoint(startpoint).await?; + let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; let checkpoint = CheckPoint::decode(&checkpoint_bytes)?; trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value); @@ -80,6 +85,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, + ctx: &RequestContext, ) -> anyhow::Result<()> { modification.lsn = lsn; decode_wal_record(recdata, decoded, self.timeline.pg_version)?; @@ -97,7 +103,7 @@ impl<'a> WalIngest<'a> { if decoded.xl_rmid == pg_constants::RM_HEAP_ID || decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - self.ingest_heapam_record(&mut buf, modification, decoded) + self.ingest_heapam_record(&mut buf, modification, decoded, ctx) .await?; } // Handle other special record types @@ -106,13 +112,14 @@ impl<'a> WalIngest<'a> { == pg_constants::XLOG_SMGR_CREATE { let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create).await?; + self.ingest_xlog_smgr_create(modification, &create, ctx) + .await?; } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == pg_constants::XLOG_SMGR_TRUNCATE { let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate) + self.ingest_xlog_smgr_truncate(modification, &truncate, ctx) .await?; } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID { debug!( @@ -126,7 +133,7 @@ impl<'a> WalIngest<'a> { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); - self.ingest_xlog_dbase_create(modification, &createdb) + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v14::bindings::XLOG_DBASE_DROP @@ -134,7 +141,9 @@ impl<'a> WalIngest<'a> { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id).await?; + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; } } } else if self.timeline.pg_version == 15 { @@ -150,7 +159,7 @@ impl<'a> WalIngest<'a> { // So we can reuse XlCreateDatabase here. 
debug!("XLOG_DBASE_CREATE_FILE_COPY"); let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb) + self.ingest_xlog_dbase_create(modification, &createdb, ctx) .await?; } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK) == postgres_ffi::v15::bindings::XLOG_DBASE_DROP @@ -158,7 +167,9 @@ impl<'a> WalIngest<'a> { let dropdb = XlDropDatabase::decode(&mut buf); for tablespace_id in dropdb.tablespace_ids { trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification.drop_dbdir(tablespace_id, dropdb.db_id).await?; + modification + .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + .await?; } } } @@ -176,12 +187,13 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else { assert!(info == pg_constants::CLOG_TRUNCATE); let xlrec = XlClogTruncate::decode(&mut buf); - self.ingest_clog_truncate_record(modification, &xlrec) + self.ingest_clog_truncate_record(modification, &xlrec, ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_XACT_ID { @@ -193,6 +205,7 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT, + ctx, ) .await?; } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED @@ -204,6 +217,7 @@ impl<'a> WalIngest<'a> { modification, &parsed_xact, info == pg_constants::XLOG_XACT_COMMIT_PREPARED, + ctx, ) .await?; // Remove twophase file. see RemoveTwoPhaseFile() in postgres code @@ -213,10 +227,12 @@ impl<'a> WalIngest<'a> { parsed_xact.xid, lsn, ); - modification.drop_twophase_file(parsed_xact.xid).await?; + modification + .drop_twophase_file(parsed_xact.xid, ctx) + .await?; } else if info == pg_constants::XLOG_XACT_PREPARE { modification - .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..])) + .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID { @@ -232,6 +248,7 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { @@ -244,6 +261,7 @@ impl<'a> WalIngest<'a> { segno, rpageno, ZERO_PAGE.clone(), + ctx, ) .await?; } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { @@ -251,12 +269,12 @@ impl<'a> WalIngest<'a> { self.ingest_multixact_create_record(modification, &xlrec)?; } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec) + self.ingest_multixact_truncate_record(modification, &xlrec, ctx) .await?; } } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID { let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, decoded) + self.ingest_relmap_page(modification, &xlrec, decoded, ctx) .await?; } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; @@ -292,7 +310,7 @@ impl<'a> WalIngest<'a> { // Iterate through all the blocks that the record modifies, and // "put" a separate copy of the record for each block. 
for blk in decoded.blocks.iter() { - self.ingest_decoded_block(modification, lsn, decoded, blk) + self.ingest_decoded_block(modification, lsn, decoded, blk, ctx) .await?; } @@ -317,6 +335,7 @@ impl<'a> WalIngest<'a> { lsn: Lsn, decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, + ctx: &RequestContext, ) -> Result<(), PageReconstructError> { let rel = RelTag { spcnode: blk.rnode_spcnode, @@ -359,14 +378,14 @@ impl<'a> WalIngest<'a> { page_set_lsn(&mut image, lsn) } assert_eq!(image.len(), BLCKSZ as usize); - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze()) + self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) .await?; } else { let rec = NeonWalRecord::Postgres { will_init: blk.will_init || blk.apply_image, rec: decoded.record.clone(), }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec) + self.put_rel_wal_record(modification, rel, blk.blkno, rec, ctx) .await?; } Ok(()) @@ -377,6 +396,7 @@ impl<'a> WalIngest<'a> { buf: &mut Bytes, modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Handle VM bit updates that are implicitly part of heap records. @@ -456,7 +476,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn).await?; + let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -481,6 +501,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } else { @@ -496,6 +517,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno: None, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } @@ -509,6 +531,7 @@ impl<'a> WalIngest<'a> { old_heap_blkno, flags: pg_constants::VISIBILITYMAP_VALID_BITS, }, + ctx, ) .await?; } @@ -524,6 +547,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlCreateDatabase, + ctx: &RequestContext, ) -> anyhow::Result<()> { let db_id = rec.db_id; let tablespace_id = rec.tablespace_id; @@ -539,7 +563,7 @@ impl<'a> WalIngest<'a> { let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn) + .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); @@ -547,10 +571,10 @@ impl<'a> WalIngest<'a> { // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn) + .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx) .await?; modification - .put_relmap_file(tablespace_id, db_id, filemap) + .put_relmap_file(tablespace_id, db_id, filemap, ctx) .await?; let mut num_rels_copied = 0; @@ -561,7 +585,7 @@ impl<'a> WalIngest<'a> { let nblocks = modification .tline - .get_rel_size(src_rel, req_lsn, true) + .get_rel_size(src_rel, req_lsn, true, ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -570,7 +594,7 @@ impl<'a> WalIngest<'a> { forknum: src_rel.forknum, }; - modification.put_rel_creation(dst_rel, nblocks).await?; + modification.put_rel_creation(dst_rel, nblocks, ctx).await?; // Copy content debug!("copying rel {} to {}, {} blocks", src_rel, dst_rel, nblocks); @@ -579,7 +603,7 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true) + 
.get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -599,6 +623,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlSmgrCreate, + ctx: &RequestContext, ) -> anyhow::Result<()> { let rel = RelTag { spcnode: rec.rnode.spcnode, @@ -606,7 +631,7 @@ impl<'a> WalIngest<'a> { relnode: rec.rnode.relnode, forknum: rec.forknum, }; - self.put_rel_creation(modification, rel).await?; + self.put_rel_creation(modification, rel, ctx).await?; Ok(()) } @@ -617,6 +642,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rec: &XlSmgrTruncate, + ctx: &RequestContext, ) -> anyhow::Result<()> { let spcnode = rec.rnode.spcnode; let dbnode = rec.rnode.dbnode; @@ -629,7 +655,7 @@ impl<'a> WalIngest<'a> { relnode, forknum: MAIN_FORKNUM, }; - self.put_rel_truncation(modification, rel, rec.blkno) + self.put_rel_truncation(modification, rel, rec.blkno, ctx) .await?; } if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 { @@ -648,10 +674,10 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn).await?; + let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position - self.put_rel_truncation(modification, rel, fsm_physical_page_no) + self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) .await?; } } @@ -670,10 +696,10 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn).await?; + let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position - self.put_rel_truncation(modification, rel, vm_page_no) + self.put_rel_truncation(modification, rel, vm_page_no, ctx) .await?; } } @@ -687,6 +713,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, parsed: &XlXactParsedRecord, is_commit: bool, + ctx: &RequestContext, ) -> anyhow::Result<()> { // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; @@ -745,10 +772,10 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); if modification .tline - .get_rel_exists(rel, last_lsn, true) + .get_rel_exists(rel, last_lsn, true, ctx) .await? { - self.put_rel_drop(modification, rel).await?; + self.put_rel_drop(modification, rel, ctx).await?; } } } @@ -759,6 +786,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlClogTruncate, + ctx: &RequestContext, ) -> anyhow::Result<()> { info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", @@ -799,16 +827,15 @@ impl<'a> WalIngest<'a> { // it. So we use the previous record's LSN in the get calls // instead. let req_lsn = modification.tline.get_last_record_lsn(); - - let slru_segments = modification + for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn) - .await?; - for segno in slru_segments { + .list_slru_segments(SlruKind::Clog, req_lsn, ctx) + .await? 
+ { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; if slru_may_delete_clogsegment(segpage, xlrec.pageno) { modification - .drop_slru_segment(SlruKind::Clog, segno) + .drop_slru_segment(SlruKind::Clog, segno, ctx) .await?; trace!("Drop CLOG segment {:>04X}", segno); } @@ -900,6 +927,7 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlMultiXactTruncate, + ctx: &RequestContext, ) -> Result<()> { self.checkpoint.oldestMulti = xlrec.end_trunc_off; self.checkpoint.oldestMultiDB = xlrec.oldest_multi_db; @@ -915,7 +943,7 @@ impl<'a> WalIngest<'a> { // contain, possibly partially, valid data. while segment != endsegment { modification - .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32) + .drop_slru_segment(SlruKind::MultiXactMembers, segment as u32, ctx) .await?; /* move to next segment, handling wraparound correctly */ @@ -937,6 +965,7 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, xlrec: &XlRelmapUpdate, decoded: &DecodedWALRecord, + ctx: &RequestContext, ) -> Result<()> { let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -944,18 +973,22 @@ impl<'a> WalIngest<'a> { buf.advance(12); modification - .put_relmap_file(xlrec.tsid, xlrec.dbid, Bytes::copy_from_slice(&buf[..])) - .await?; - - Ok(()) + .put_relmap_file( + xlrec.tsid, + xlrec.dbid, + Bytes::copy_from_slice(&buf[..]), + ctx, + ) + .await } async fn put_rel_creation( &mut self, modification: &mut DatadirModification<'_>, rel: RelTag, + ctx: &RequestContext, ) -> Result<()> { - modification.put_rel_creation(rel, 0).await?; + modification.put_rel_creation(rel, 0, ctx).await?; Ok(()) } @@ -965,8 +998,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - self.handle_rel_extend(modification, rel, blknum).await?; + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { + self.handle_rel_extend(modification, rel, blknum, ctx) + .await?; modification.put_rel_page_image(rel, blknum, img)?; Ok(()) } @@ -977,8 +1012,10 @@ impl<'a> WalIngest<'a> { rel: RelTag, blknum: BlockNumber, rec: NeonWalRecord, - ) -> anyhow::Result<()> { - self.handle_rel_extend(modification, rel, blknum).await?; + ctx: &RequestContext, + ) -> Result<()> { + self.handle_rel_extend(modification, rel, blknum, ctx) + .await?; modification.put_rel_wal_record(rel, blknum, rec)?; Ok(()) } @@ -988,8 +1025,9 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, rel: RelTag, nblocks: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { - modification.put_rel_truncation(rel, nblocks).await?; + modification.put_rel_truncation(rel, nblocks, ctx).await?; Ok(()) } @@ -997,17 +1035,22 @@ impl<'a> WalIngest<'a> { &mut self, modification: &mut DatadirModification<'_>, rel: RelTag, + ctx: &RequestContext, ) -> Result<()> { - modification.put_rel_drop(rel).await?; + modification.put_rel_drop(rel, ctx).await?; Ok(()) } - async fn get_relsize(&mut self, rel: RelTag, lsn: Lsn) -> anyhow::Result { - let exists = self.timeline.get_rel_exists(rel, lsn, true).await?; - let nblocks = if !exists { + async fn get_relsize( + &mut self, + rel: RelTag, + lsn: Lsn, + ctx: &RequestContext, + ) -> anyhow::Result { + let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? { 0 } else { - self.timeline.get_rel_size(rel, lsn, true).await? + self.timeline.get_rel_size(rel, lsn, true, ctx).await? 
}; Ok(nblocks) } @@ -1017,23 +1060,28 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, rel: RelTag, blknum: BlockNumber, - ) -> anyhow::Result<()> { + ctx: &RequestContext, + ) -> Result<(), PageReconstructError> { let new_nblocks = blknum + 1; // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it let last_lsn = modification.lsn; - let old_nblocks = if !self.timeline.get_rel_exists(rel, last_lsn, true).await? { + let old_nblocks = if !self + .timeline + .get_rel_exists(rel, last_lsn, true, ctx) + .await? + { // create it with 0 size initially, the logic below will extend it - modification.put_rel_creation(rel, 0).await?; + modification.put_rel_creation(rel, 0, ctx).await?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn, true).await? + self.timeline.get_rel_size(rel, last_lsn, true, ctx).await? }; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); - modification.put_rel_extend(rel, new_nblocks).await?; + modification.put_rel_extend(rel, new_nblocks, ctx).await?; // fill the gap with zeros for gap_blknum in old_nblocks..blknum { @@ -1050,8 +1098,9 @@ impl<'a> WalIngest<'a> { segno: u32, blknum: BlockNumber, img: Bytes, - ) -> anyhow::Result<()> { - self.handle_slru_extend(modification, kind, segno, blknum) + ctx: &RequestContext, + ) -> Result<()> { + self.handle_slru_extend(modification, kind, segno, blknum, ctx) .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; Ok(()) @@ -1063,6 +1112,7 @@ impl<'a> WalIngest<'a> { kind: SlruKind, segno: u32, blknum: BlockNumber, + ctx: &RequestContext, ) -> anyhow::Result<()> { // we don't use a cache for this like we do for relations. SLRUS are explcitly // extended with ZEROPAGE records, not with commit records, so it happens @@ -1075,17 +1125,17 @@ impl<'a> WalIngest<'a> { let last_lsn = self.timeline.get_last_record_lsn(); let old_nblocks = if !self .timeline - .get_slru_segment_exists(kind, segno, last_lsn) + .get_slru_segment_exists(kind, segno, last_lsn, ctx) .await? { // create it with 0 size initially, the logic below will extend it modification - .put_slru_segment_creation(kind, segno, 0) + .put_slru_segment_creation(kind, segno, 0, ctx) .await?; 0 } else { self.timeline - .get_slru_segment_size(kind, segno, last_lsn) + .get_slru_segment_size(kind, segno, last_lsn, ctx) .await? 
}; @@ -1134,41 +1184,44 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - async fn init_walingest_test(tline: &Timeline) -> Result { + async fn init_walingest_test<'a>( + tline: &'a Timeline, + ctx: &RequestContext, + ) -> Result> { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; - m.put_relmap_file(0, 111, Bytes::from("")).await?; // dummy relmapper file + m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file m.commit()?; - let walingest = WalIngest::new(tline, Lsn(0x10)).await?; + let walingest = WalIngest::new(tline, Lsn(0x10), ctx).await?; Ok(walingest) } #[tokio::test] async fn test_relsize() -> Result<()> { - let tenant = TenantHarness::create("test_relsize")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_relsize")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); - walingest.put_rel_creation(&mut m, TESTREL_A).await?; + walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?; walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x30)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4")) + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx) .await?; m.commit()?; let mut m = tline.begin_modification(Lsn(0x50)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5")) + .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx) .await?; m.commit()?; @@ -1176,120 +1229,157 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
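Before the assertions that follow, it helps to spell out the pattern the walingest.rs hunks above apply everywhere: every ingest helper and every Timeline accessor grows a trailing `ctx: &RequestContext` parameter that is simply forwarded to the next layer. A minimal sketch of that plumbing, using hypothetical, drastically simplified stand-ins rather than the real pageserver signatures:

    // Sketch only: these types stand in for the real RequestContext, Timeline
    // and WalIngest; the point is the parameter threading, not the API.
    struct RequestContext; // carries per-request metadata in the real code
    struct Timeline;
    #[derive(Clone, Copy)]
    struct RelTag;

    impl Timeline {
        async fn get_rel_size(&self, _rel: RelTag, _ctx: &RequestContext) -> Result<u32, String> {
            Ok(0) // placeholder; the real call may go to disk or remote storage
        }
    }

    struct WalIngest<'a> {
        timeline: &'a Timeline,
    }

    impl<'a> WalIngest<'a> {
        // Each helper takes ctx by shared borrow and forwards the same borrow
        // downward, exactly as the diff does for every put_*/get_* call.
        async fn get_relsize(&self, rel: RelTag, ctx: &RequestContext) -> Result<u32, String> {
            self.timeline.get_rel_size(rel, ctx).await
        }
    }

Nothing about the ingest logic changes; the context only has to reach the Timeline read paths, which is why the diff is wide but shallow.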
assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false) + .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) .await .is_err()); - assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, + 1 + ); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, + 3 + ); // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx) .await?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); // Truncate last block let mut m = tline.begin_modification(Lsn(0x60)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 2).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 2, &ctx) + .await?; m.commit()?; assert_current_logical_size(&tline, Lsn(0x60)); // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 2); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false) + .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .await?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); // should still see the truncated block with older LSN - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, 3); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false) + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, + 3 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); // Truncate to zero length let mut m = tline.begin_modification(Lsn(0x68)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 0).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 0, &ctx) + .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x68), false).await?, 0); + assert_eq!( + 
tline + .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) + .await?, + 0 + ); // Extend from 0 to 2 blocks, leaving a gap let mut m = tline.begin_modification(Lsn(0x70)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1")) + .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx) .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x70), false).await?, 2); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false) + .get_rel_size(TESTREL_A, Lsn(0x70), false, &ctx) + .await?, + 2 + ); + assert_eq!( + tline + .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false) + .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx) .await?, TEST_IMG("foo blk 1") ); @@ -1297,21 +1387,26 @@ mod tests { // Extend a lot more, leaving a big gap that spans across segments let mut m = tline.begin_modification(Lsn(0x80)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500")) + .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx) .await?; m.commit()?; - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, 1501); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, + 1501 + ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx) .await?, TEST_IMG("foo blk 1500") ); @@ -1323,31 +1418,40 @@ mod tests { // and then created it again within the same layer. 
#[tokio::test] async fn test_drop_extend() -> Result<()> { - let tenant = TenantHarness::create("test_drop_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_drop_extend")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut m = tline.begin_modification(Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx) .await?; m.commit()?; // Check that rel exists and size is correct assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, + 1 + ); // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_drop(&mut m, TESTREL_A).await?; + walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?; m.commit()?; // Check that rel is not visible anymore assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x30), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx) + .await?, false ); @@ -1357,16 +1461,23 @@ mod tests { // Re-create it let mut m = tline.begin_modification(Lsn(0x40)); walingest - .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4")) + .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx) .await?; m.commit()?; // Check that rel exists and size is correct assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x40), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx) + .await?, true ); - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x40), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx) + .await?, + 1 + ); Ok(()) } @@ -1376,9 +1487,9 @@ mod tests { // and then extended it again within the same layer. #[tokio::test] async fn test_truncate_extend() -> Result<()> { - let tenant = TenantHarness::create("test_truncate_extend")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_truncate_extend")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; // Create a 20 MB relation (the size is arbitrary) let relsize = 20 * 1024 * 1024 / 8192; @@ -1386,27 +1497,33 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, Lsn(0x20)); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } m.commit()?; // The relation was created at LSN 20, not visible at LSN 1 yet. 
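For concreteness: the "20 MB relation" built above is 20 * 1024 * 1024 / 8192 = 2560 pages, assuming the standard 8 KiB Postgres block size. A trivial check of that arithmetic:

    // Sanity check of the relsize computation used in test_truncate_extend.
    const BLCKSZ: u32 = 8192; // standard Postgres block size
    fn main() {
        let relsize = 20 * 1024 * 1024 / BLCKSZ;
        assert_eq!(relsize, 2560);
        println!("20 MiB relation = {relsize} blocks");
    }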
assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x10), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false) + .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) .await .is_err()); assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, true ); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x20), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .await?, relsize ); @@ -1416,7 +1533,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false) + .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx) .await?, TEST_IMG(&data) ); @@ -1425,18 +1542,25 @@ mod tests { // Truncate relation so that second segment was dropped // - only leave one page let mut m = tline.begin_modification(Lsn(0x60)); - walingest.put_rel_truncation(&mut m, TESTREL_A, 1).await?; + walingest + .put_rel_truncation(&mut m, TESTREL_A, 1, &ctx) + .await?; m.commit()?; // Check reported size and contents after truncation - assert_eq!(tline.get_rel_size(TESTREL_A, Lsn(0x60), false).await?, 1); + assert_eq!( + tline + .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .await?, + 1 + ); for blkno in 0..1 { let lsn = Lsn(0x20); let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1444,7 +1568,9 @@ mod tests { // should still see all blocks with older LSN assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x50), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .await?, relsize ); for blkno in 0..relsize { @@ -1452,7 +1578,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1465,17 +1591,21 @@ mod tests { for blkno in 0..relsize { let data = format!("foo blk {} at {}", blkno, lsn); walingest - .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data)) + .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx) .await?; } m.commit()?; assert_eq!( - tline.get_rel_exists(TESTREL_A, Lsn(0x80), false).await?, + tline + .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, true ); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(0x80), false).await?, + tline + .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .await?, relsize ); // Check relation content @@ -1484,7 +1614,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false) + .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx) .await?, TEST_IMG(&data) ); @@ -1497,9 +1627,9 @@ mod tests { /// split into multiple 1 GB segments in Postgres. 
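The test that follows drives a relation past the 1 GB segment boundary. Assuming the default 8 KiB block size, RELSEG_SIZE is 131072 blocks (131072 * 8192 bytes = 1 GiB), so the block-to-segment arithmetic the doc comment alludes to looks like this sketch (not pageserver code):

    // Sketch: mapping a block number to its 1 GB Postgres segment file,
    // assuming the default 8 KiB BLCKSZ and RELSEG_SIZE = 131072 blocks.
    const RELSEG_SIZE: u32 = 131072; // 131072 * 8192 bytes = 1 GiB

    fn segment_of(blkno: u32) -> (u32, u32) {
        (blkno / RELSEG_SIZE, blkno % RELSEG_SIZE) // (segment file, offset within it)
    }

    fn main() {
        assert_eq!(segment_of(RELSEG_SIZE - 1), (0, RELSEG_SIZE - 1)); // last block of segment 0
        assert_eq!(segment_of(RELSEG_SIZE), (1, 0));                   // first block of segment 1
    }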
#[tokio::test] async fn test_large_rel() -> Result<()> { - let tenant = TenantHarness::create("test_large_rel")?.load().await; - let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION)?; - let mut walingest = init_walingest_test(&tline).await?; + let (tenant, ctx) = TenantHarness::create("test_large_rel")?.load().await; + let tline = create_test_timeline(&tenant, TIMELINE_ID, DEFAULT_PG_VERSION, &ctx)?; + let mut walingest = init_walingest_test(&tline, &ctx).await?; let mut lsn = 0x10; for blknum in 0..RELSEG_SIZE + 1 { @@ -1507,7 +1637,7 @@ mod tests { let mut m = tline.begin_modification(Lsn(lsn)); let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn))); walingest - .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img) + .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx) .await?; m.commit()?; } @@ -1515,7 +1645,7 @@ mod tests { assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE + 1 ); @@ -1523,11 +1653,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE) + .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -1536,11 +1666,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1) + .put_rel_truncation(&mut m, TESTREL_A, RELSEG_SIZE - 1, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -1552,11 +1682,11 @@ mod tests { lsn += 0x10; let mut m = tline.begin_modification(Lsn(lsn)); walingest - .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber) + .put_rel_truncation(&mut m, TESTREL_A, size as BlockNumber, &ctx) .await?; m.commit()?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false).await?, + tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, size as BlockNumber ); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index fd0524016f..c943bf0a27 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,16 +22,18 @@ use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; use serde::Serialize; +use std::collections::VecDeque; use std::fs::OpenOptions; use std::io::prelude::*; use std::io::{Error, ErrorKind}; use std::ops::{Deref, DerefMut}; +use std::os::fd::RawFd; use std::os::unix::io::AsRawFd; use std::os::unix::prelude::CommandExt; use std::path::PathBuf; use std::process::Stdio; use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command}; -use std::sync::Mutex; +use std::sync::{Mutex, MutexGuard}; use std::time::Duration; use std::time::Instant; use std::{fs, io}; @@ -90,6 +92,20 @@ pub trait WalRedoManager: Send + Sync { ) -> Result; } +struct ProcessInput { + child: NoLeakChild, + stdin: ChildStdin, + stderr_fd: RawFd, + stdout_fd: RawFd, + n_requests: usize, +} + +struct ProcessOutput { + stdout: ChildStdout, + pending_responses: VecDeque>, + n_processed_responses: usize, +} + /// /// This is the 
real implementation that uses a Postgres process to /// perform WAL replay. Only one thread can use the process at a time, @@ -101,7 +117,9 @@ pub struct PostgresRedoManager { tenant_id: TenantId, conf: &'static PageServerConf, - process: Mutex>, + stdout: Mutex>, + stdin: Mutex>, + stderr: Mutex>, } /// Can this request be served by neon redo functions @@ -209,16 +227,17 @@ impl PostgresRedoManager { PostgresRedoManager { tenant_id, conf, - process: Mutex::new(None), + stdin: Mutex::new(None), + stdout: Mutex::new(None), + stderr: Mutex::new(None), } } /// Launch process pre-emptively. Should not be needed except for benchmarking. - pub fn launch_process(&mut self, pg_version: u32) -> anyhow::Result<()> { - let inner = self.process.get_mut().unwrap(); - if inner.is_none() { - let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; - *inner = Some(p); + pub fn launch_process(&self, pg_version: u32) -> anyhow::Result<()> { + let mut proc = self.stdin.lock().unwrap(); + if proc.is_none() { + self.launch(&mut proc, pg_version)?; } Ok(()) } @@ -241,22 +260,19 @@ impl PostgresRedoManager { let start_time = Instant::now(); - let mut process_guard = self.process.lock().unwrap(); + let mut proc = self.stdin.lock().unwrap(); let lock_time = Instant::now(); // launch the WAL redo process on first use - if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; - *process_guard = Some(p); + if proc.is_none() { + self.launch(&mut proc, pg_version)?; } - let process = process_guard.as_mut().unwrap(); - WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64()); // Relational WAL records are applied using wal-redo-postgres let buf_tag = BufferTag { rel, blknum }; - let result = process - .apply_wal_records(buf_tag, base_img, records, wal_redo_timeout) + let result = self + .apply_wal_records(proc, buf_tag, base_img, records, wal_redo_timeout) .map_err(WalRedoError::IoError); let end_time = Instant::now(); @@ -295,8 +311,22 @@ impl PostgresRedoManager { base_img_lsn, lsn ); - let process = process_guard.take().unwrap(); - process.kill(); + // self.stdin only holds stdin & stderr as_raw_fd(). + // Dropping it as part of take() doesn't close them. + // The owning objects (ChildStdout and ChildStderr) are stored in + // self.stdout and self.stderr, respsectively. + // We intentionally keep them open here to avoid a race between + // currently running `apply_wal_records()` and a `launch()` call + // after we return here. + // The currently running `apply_wal_records()` must not read from + // the newly launched process. + // By keeping self.stdout and self.stderr open here, `launch()` will + // get other file descriptors for the new child's stdout and stderr, + // and hence the current `apply_wal_records()` calls will observe + // `output.stdout.as_raw_fd() != stdout_fd` . + if let Some(proc) = self.stdin.lock().unwrap().take() { + proc.child.kill_and_wait(); + } } result } @@ -595,32 +625,23 @@ impl CloseFileDescriptors for C { } } -/// -/// Handle to the Postgres WAL redo process -/// -struct PostgresRedoProcess { - tenant_id: TenantId, - child: NoLeakChild, - stdin: ChildStdin, - stdout: ChildStdout, - stderr: ChildStderr, -} - -impl PostgresRedoProcess { +impl PostgresRedoManager { // // Start postgres binary in special WAL redo mode. 
// - #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + #[instrument(skip_all,fields(tenant_id=%self.tenant_id, pg_version=pg_version))] fn launch( - conf: &PageServerConf, - tenant_id: TenantId, + &self, + input: &mut MutexGuard>, pg_version: u32, - ) -> Result { + ) -> Result<(), Error> { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. let datadir = path_with_suffix_extension( - conf.tenant_path(&tenant_id).join("wal-redo-datadir"), + self.conf + .tenant_path(&self.tenant_id) + .join("wal-redo-datadir"), TEMP_FILE_SUFFIX, ); @@ -634,10 +655,12 @@ impl PostgresRedoProcess { ) })?; } - let pg_bin_dir_path = conf + let pg_bin_dir_path = self + .conf .pg_bin_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_bin_dir path: {e}")))?; - let pg_lib_dir_path = conf + let pg_lib_dir_path = self + .conf .pg_lib_dir(pg_version) .map_err(|e| Error::new(ErrorKind::Other, format!("incorrect pg_lib_dir path: {e}")))?; @@ -723,27 +746,31 @@ impl PostgresRedoProcess { // all fallible operations post-spawn are complete, so get rid of the guard let child = scopeguard::ScopeGuard::into_inner(child); - Ok(PostgresRedoProcess { - tenant_id, + **input = Some(ProcessInput { child, + stdout_fd: stdout.as_raw_fd(), + stderr_fd: stderr.as_raw_fd(), stdin, + n_requests: 0, + }); + + *self.stdout.lock().unwrap() = Some(ProcessOutput { stdout, - stderr, - }) + pending_responses: VecDeque::new(), + n_processed_responses: 0, + }); + *self.stderr.lock().unwrap() = Some(stderr); + + Ok(()) } - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] - fn kill(self) { - self.child.kill_and_wait(); - } - - // // Apply given WAL records ('records') over an old page image. Returns // new page image. // - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))] fn apply_wal_records( - &mut self, + &self, + mut input: MutexGuard>, tag: BufferTag, base_img: Option, records: &[(Lsn, NeonWalRecord)], @@ -780,33 +807,23 @@ impl PostgresRedoProcess { build_get_page_msg(tag, &mut writebuf); WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64); - // The input is now in 'writebuf'. Do a blind write first, writing as much as - // we can, before calling poll(). That skips one call to poll() if the stdin is - // already available for writing, which it almost certainly is because the - // process is idle. - let mut nwrite = self.stdin.write(&writebuf)?; - - // We expect the WAL redo process to respond with an 8k page image. We read it - // into this buffer. 
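The restructuring above replaces the single process mutex with separate stdin/stdout/stderr mutexes, so one caller can be writing a request while another is still reading an earlier response. A stripped-down sketch of that shape, with the child handle and pipes elided and only the request numbering visible (the names are simplified stand-ins for ProcessInput/ProcessOutput):

    use std::collections::VecDeque;
    use std::sync::Mutex;

    // Sketch only: the real ProcessInput/ProcessOutput also hold the child
    // process handle and its pipes; here just the locking shape is shown.
    struct ProcessInputSketch {
        n_requests: usize,
    }

    struct ProcessOutputSketch {
        n_processed_responses: usize,
        pending_responses: VecDeque<Option<Vec<u8>>>,
    }

    struct RedoManagerSketch {
        stdin: Mutex<Option<ProcessInputSketch>>,
        stdout: Mutex<Option<ProcessOutputSketch>>,
    }

    impl RedoManagerSketch {
        // Send side: a request number is handed out under the stdin lock only,
        // so response reading (stdout lock) never blocks request writing.
        fn next_request_no(&self) -> usize {
            let mut guard = self.stdin.lock().unwrap();
            let input = guard.get_or_insert(ProcessInputSketch { n_requests: 0 });
            let request_no = input.n_requests;
            input.n_requests += 1;
            request_no
        }
    }

The payoff shows up in apply_wal_records above: the stdin guard is dropped as soon as the request bytes are written, and the caller then waits for its numbered response under the stdout lock.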
- let mut resultbuf = vec![0; BLCKSZ.into()]; - let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + let proc = input.as_mut().unwrap(); + let mut nwrite = 0usize; + let stdout_fd = proc.stdout_fd; // Prepare for calling poll() let mut pollfds = [ - PollFd::new(self.stdout.as_raw_fd(), PollFlags::POLLIN), - PollFd::new(self.stderr.as_raw_fd(), PollFlags::POLLIN), - PollFd::new(self.stdin.as_raw_fd(), PollFlags::POLLOUT), + PollFd::new(proc.stdin.as_raw_fd(), PollFlags::POLLOUT), + PollFd::new(proc.stderr_fd, PollFlags::POLLIN), + PollFd::new(stdout_fd, PollFlags::POLLIN), ]; - // We do three things simultaneously: send the old base image and WAL records to - // the child process's stdin, read the result from child's stdout, and forward any logging + // We do two things simultaneously: send the old base image and WAL records to + // the child process's stdin and forward any logging // information that the child writes to its stderr to the page server's log. - while nresult < BLCKSZ.into() { - // If we have more data to write, wake up if 'stdin' becomes writeable or - // we have data to read. Otherwise only wake up if there's data to read. - let nfds = if nwrite < writebuf.len() { 3 } else { 2 }; + while nwrite < writebuf.len() { let n = loop { - match nix::poll::poll(&mut pollfds[0..nfds], wal_redo_timeout.as_millis() as i32) { + match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) { Err(e) if e == nix::errno::Errno::EINTR => continue, res => break res, } @@ -820,14 +837,16 @@ impl PostgresRedoProcess { let err_revents = pollfds[1].revents().unwrap(); if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { let mut errbuf: [u8; 16384] = [0; 16384]; - let n = self.stderr.read(&mut errbuf)?; + let mut stderr_guard = self.stderr.lock().unwrap(); + let stderr = stderr_guard.as_mut().unwrap(); + let len = stderr.read(&mut errbuf)?; // The message might not be split correctly into lines here. But this is // good enough, the important thing is to get the message to the log. - if n > 0 { + if len > 0 { error!( "wal-redo-postgres: {}", - String::from_utf8_lossy(&errbuf[0..n]) + String::from_utf8_lossy(&errbuf[0..len]) ); // To make sure we capture all log from the process if it fails, keep @@ -841,33 +860,157 @@ impl PostgresRedoProcess { )); } - // If we have more data to write and 'stdin' is writeable, do write. - if nwrite < writebuf.len() { - let in_revents = pollfds[2].revents().unwrap(); - if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { - nwrite += self.stdin.write(&writebuf[nwrite..])?; - } else if in_revents.contains(PollFlags::POLLHUP) { - // We still have more data to write, but the process closed the pipe. - return Err(Error::new( - ErrorKind::BrokenPipe, - "WAL redo process closed its stdin unexpectedly", - )); - } - } - - // If we have some data in stdout, read it to the result buffer. - let out_revents = pollfds[0].revents().unwrap(); - if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { - nresult += self.stdout.read(&mut resultbuf[nresult..])?; - } else if out_revents.contains(PollFlags::POLLHUP) { + // If 'stdin' is writeable, do write. + let in_revents = pollfds[0].revents().unwrap(); + if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() { + nwrite += proc.stdin.write(&writebuf[nwrite..])?; + } else if in_revents.contains(PollFlags::POLLHUP) { + // We still have more data to write, but the process closed the pipe. 
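Zooming out from the branch above: the new write loop multiplexes the child's stdin and stderr with nix::poll, waiting for stdin to become writable while forwarding anything the child writes to stderr into the page server's log. A minimal standalone sketch of that pattern, assuming the same nix API surface this file already uses (PollFd::new taking a RawFd; the function name and buffer size are illustrative only):

    use nix::poll::{poll, PollFd, PollFlags};
    use std::io::{Error, ErrorKind, Read, Write};
    use std::os::unix::io::AsRawFd;
    use std::process::{ChildStderr, ChildStdin};

    // Sketch of the multiplexed write: keep draining stderr before writing more,
    // so the child's log output is captured even if the protocol stalls.
    fn write_draining_stderr(
        stdin: &mut ChildStdin,
        stderr: &mut ChildStderr,
        buf: &[u8],
        timeout_ms: i32,
    ) -> std::io::Result<()> {
        let mut nwrite = 0;
        while nwrite < buf.len() {
            let mut fds = [
                PollFd::new(stdin.as_raw_fd(), PollFlags::POLLOUT),
                PollFd::new(stderr.as_raw_fd(), PollFlags::POLLIN),
            ];
            // `?` works here because nix's Errno converts into std::io::Error.
            if poll(&mut fds, timeout_ms)? == 0 {
                return Err(Error::new(ErrorKind::Other, "WAL redo timed out"));
            }
            if fds[1].revents().unwrap_or(PollFlags::empty()).intersects(PollFlags::POLLIN) {
                let mut errbuf = [0u8; 4096];
                let n = stderr.read(&mut errbuf)?;
                eprintln!("child stderr: {}", String::from_utf8_lossy(&errbuf[..n]));
                continue; // forward all pending log output before writing more
            }
            if fds[0].revents().unwrap_or(PollFlags::empty()).intersects(PollFlags::POLLOUT) {
                nwrite += stdin.write(&buf[nwrite..])?;
            }
        }
        Ok(())
    }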
return Err(Error::new( ErrorKind::BrokenPipe, - "WAL redo process closed its stdout unexpectedly", + "WAL redo process closed its stdin unexpectedly", )); } } + let request_no = proc.n_requests; + proc.n_requests += 1; + drop(input); - Ok(Bytes::from(resultbuf)) + // To improve walredo performance we separate sending requests and receiving + // responses. Them are protected by different mutexes (output and input). + // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process + // then there is not warranty that T1 will first granted output mutex lock. + // To address this issue we maintain number of sent requests, number of processed + // responses and ring buffer with pending responses. After sending response + // (under input mutex), threads remembers request number. Then it releases + // input mutex, locks output mutex and fetch in ring buffer all responses until + // its stored request number. The it takes correspondent element from + // pending responses ring buffer and truncate all empty elements from the front, + // advancing processed responses number. + + let mut output_guard = self.stdout.lock().unwrap(); + let output = output_guard.as_mut().unwrap(); + if output.stdout.as_raw_fd() != stdout_fd { + // If stdout file descriptor is changed then it means that walredo process is crashed and restarted. + // As far as ProcessInput and ProcessOutout are protected by different mutexes, + // it can happen that we send request to one process and waiting response from another. + // To prevent such situation we compare stdout file descriptors. + // As far as old stdout pipe is destroyed only after new one is created, + // it can not reuse the same file descriptor, so this check is safe. + // + // Cross-read this with the comment in apply_batch_postgres if result.is_err(). + // That's where we kill the child process. + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stdout unexpectedly", + )); + } + let n_processed_responses = output.n_processed_responses; + while n_processed_responses + output.pending_responses.len() <= request_no { + // We expect the WAL redo process to respond with an 8k page image. We read it + // into this buffer. + let mut resultbuf = vec![0; BLCKSZ.into()]; + let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far + while nresult < BLCKSZ.into() { + // We do two things simultaneously: reading response from stdout + // and forward any logging information that the child writes to its stderr to the page server's log. + let n = loop { + match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) { + Err(e) if e == nix::errno::Errno::EINTR => continue, + res => break res, + } + }?; + + if n == 0 { + return Err(Error::new(ErrorKind::Other, "WAL redo timed out")); + } + + // If we have some messages in stderr, forward them to the log. + let err_revents = pollfds[1].revents().unwrap(); + if err_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + let mut errbuf: [u8; 16384] = [0; 16384]; + let mut stderr_guard = self.stderr.lock().unwrap(); + let stderr = stderr_guard.as_mut().unwrap(); + let len = stderr.read(&mut errbuf)?; + + // The message might not be split correctly into lines here. But this is + // good enough, the important thing is to get the message to the log. 
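The bookkeeping described in the long comment above, requests numbered under the stdin lock and responses parked in a VecDeque of Option slots until their owner claims them, can be exercised on its own; the take-and-trim implementation follows just below. A small self-contained sketch of just that ring-buffer logic, with strings standing in for 8 KiB page images:

    use std::collections::VecDeque;

    // Sketch: slot `request_no - n_processed` holds response `request_no`;
    // taken slots become None and are trimmed from the front of the deque.
    struct PendingResponses {
        n_processed: usize,
        slots: VecDeque<Option<String>>,
    }

    impl PendingResponses {
        fn push_response(&mut self, resp: String) {
            self.slots.push_back(Some(resp));
        }

        fn take(&mut self, request_no: usize) -> String {
            let resp = self.slots[request_no - self.n_processed]
                .take()
                .expect("each request_no is taken exactly once");
            // Trim contiguous taken slots from the front, advancing the counter.
            while matches!(self.slots.front(), Some(None)) {
                self.slots.pop_front();
                self.n_processed += 1;
            }
            resp
        }
    }

    fn main() {
        let mut p = PendingResponses { n_processed: 0, slots: VecDeque::new() };
        p.push_response("page 0".into());
        p.push_response("page 1".into());
        assert_eq!(p.take(1), "page 1"); // out-of-order consumer, like T2 in the comment
        assert_eq!(p.take(0), "page 0"); // slot 0 is only trimmed now
        assert_eq!(p.n_processed, 2);
    }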
+ if len > 0 { + error!( + "wal-redo-postgres: {}", + String::from_utf8_lossy(&errbuf[0..len]) + ); + + // To make sure we capture all log from the process if it fails, keep + // reading from the stderr, before checking the stdout. + continue; + } + } else if err_revents.contains(PollFlags::POLLHUP) { + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stderr unexpectedly", + )); + } + + // If we have some data in stdout, read it to the result buffer. + let out_revents = pollfds[2].revents().unwrap(); + if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() { + nresult += output.stdout.read(&mut resultbuf[nresult..])?; + } else if out_revents.contains(PollFlags::POLLHUP) { + return Err(Error::new( + ErrorKind::BrokenPipe, + "WAL redo process closed its stdout unexpectedly", + )); + } + } + output + .pending_responses + .push_back(Some(Bytes::from(resultbuf))); + } + // Replace our request's response with None in `pending_responses`. + // Then make space in the ring buffer by clearing out any seqence of contiguous + // `None`'s from the front of `pending_responses`. + // NB: We can't pop_front() because other requests' responses because another + // requester might have grabbed the output mutex before us: + // T1: grab input mutex + // T1: send request_no 23 + // T1: release input mutex + // T2: grab input mutex + // T2: send request_no 24 + // T2: release input mutex + // T2: grab output mutex + // T2: n_processed_responses + output.pending_responses.len() <= request_no + // 23 0 24 + // T2: enters poll loop that reads stdout + // T2: put response for 23 into pending_responses + // T2: put response for 24 into pending_resposnes + // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back + // T2: takes its response_24 + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Some(response_23) None Back + // T2: releases output mutex + // T1: grabs output mutex + // T1: n_processed_responses + output.pending_responses.len() > request_no + // 23 2 23 + // T1: skips poll loop that reads stdout + // T1: takes its response_23 + // pending_responses now looks like this: Front None None Back + // T2: does the while loop below + // pending_responses now looks like this: Front Back + // n_processed_responses now has value 25 + let res = output.pending_responses[request_no - n_processed_responses] + .take() + .expect("we own this request_no, nobody else is supposed to take it"); + while let Some(front) = output.pending_responses.front() { + if front.is_none() { + output.pending_responses.pop_front(); + output.n_processed_responses += 1; + } else { + break; + } + } + Ok(res) } } diff --git a/poetry.lock b/poetry.lock index edbcddd576..fc37124184 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,3 +1,21 @@ +[[package]] +name = "aiohttp" +version = "3.7.0" +description = "Async http client/server framework (asyncio)" +category = "main" +optional = false +python-versions = ">=3.6" + +[package.dependencies] +async-timeout = ">=3.0,<4.0" +attrs = ">=17.3.0" +chardet = ">=2.0,<4.0" +multidict = ">=4.5,<7.0" +yarl = ">=1.0,<2.0" + +[package.extras] +speedups = ["aiodns", "brotlipy", "cchardet"] + [[package]] name = "aiopg" version = "1.3.4" @@ -41,11 +59,11 @@ six = ">=1.9.0" [[package]] name = "async-timeout" -version = "4.0.2" +version = "3.0.1" description = "Timeout context manager for asyncio programs" category = 
"main" optional = false -python-versions = ">=3.6" +python-versions = ">=3.5.3" [[package]] name = "asyncpg" @@ -560,6 +578,14 @@ networkx = ">=2.4,<3.0" pyyaml = ">5.4" sarif-om = ">=1.0.4,<1.1.0" +[[package]] +name = "chardet" +version = "3.0.4" +description = "Universal encoding detector for Python 2 and 3" +category = "main" +optional = false +python-versions = "*" + [[package]] name = "charset-normalizer" version = "2.1.0" @@ -939,6 +965,14 @@ server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)" ssm = ["PyYAML (>=5.1)", "dataclasses"] xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"] +[[package]] +name = "multidict" +version = "6.0.4" +description = "multidict implementation" +category = "main" +optional = false +python-versions = ">=3.7" + [[package]] name = "mypy" version = "0.991" @@ -1580,6 +1614,18 @@ category = "main" optional = false python-versions = ">=3.4" +[[package]] +name = "yarl" +version = "1.8.2" +description = "Yet another URL library" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +idna = ">=2.0" +multidict = ">=4.0" + [[package]] name = "zipp" version = "3.8.1" @@ -1595,9 +1641,44 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>= [metadata] lock-version = "1.1" python-versions = "^3.9" -content-hash = "af44b269c235a6fd59dacb4ff9e05cbc13a79b57254a8d5d4bde934bd5691a70" +content-hash = "0f7289ef9439d1d7cd36b07efb53741b773669b0f860189c800270b7def0c241" [metadata.files] +aiohttp = [ + {file = "aiohttp-3.7.0-cp36-cp36m-macosx_10_14_x86_64.whl", hash = "sha256:72fe89f7e14939e896d984c4b592580f8cdfa7497feb1c0c24639a9c60be3eb9"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:fdf778d4c4bf976e69a37213fe8083613d0851976ddcf485bd7c0650a43d3852"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:fee7b5e68939ffc09f9b29f167ed49c8b50de3eee0a1d8108b439ddd9963af46"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_i686.whl", hash = "sha256:dd64634713be409202058f2ea267dfbcdd74b387b8793425f21ef0266d45d0e9"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_ppc64le.whl", hash = "sha256:713dd7fd70ddda9dc8d014c49dd0e55b58afe4e0cddb8722c7501f53edf30c3f"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_s390x.whl", hash = "sha256:d31c43f7c4948ce01957f9a1ceee0784e067778477557ebccdf805398331c1a1"}, + {file = "aiohttp-3.7.0-cp36-cp36m-manylinux2014_x86_64.whl", hash = "sha256:5e26d6003eb6df304608d9fd9c9437065a8532d869a3ffcbd8113a3d710f8239"}, + {file = "aiohttp-3.7.0-cp36-cp36m-win_amd64.whl", hash = "sha256:bf08462cddd10ddd8ffe5cb5c1638bfa051290909ebedb31c06e46578b9b7529"}, + {file = "aiohttp-3.7.0-cp37-cp37m-macosx_10_14_x86_64.whl", hash = "sha256:07bacf6721db51a4c6160ed3031a2a97910647969dafd7c653f600f3b542f463"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:245b58e30bc889d18b783db2f09ef1d814f466e15c84325410827451297003a0"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b392e5c3e122586c49cd8b9426f577bf4d51958933b839d158d28b69515af74e"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:5b5c320621a171aa85f96909af28fbb5286bd6842066db3062b083ba92261256"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:97d2341d1360dbe2c5b1d94922f7d68f9ce2ded1daab88b9bdeb49ce419cdc1b"}, + {file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:beda23f292716887532661dc19abb9db2302ccfbd671a080cd8f4be7463d0841"}, + 
{file = "aiohttp-3.7.0-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:cbcaae9a6f14f762348d19b2dce8162772c0b0a1739314e18492a308a22caf96"}, + {file = "aiohttp-3.7.0-cp37-cp37m-win_amd64.whl", hash = "sha256:7a49ef7b691babc83db126db874fbf26ba2f781899b91399f9ff8b235f059245"}, + {file = "aiohttp-3.7.0-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:f56892f57310415cf6a179eec3ea6c7a82a9d37fbc00894943ea3154011a6d2a"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux1_i686.whl", hash = "sha256:df1274b7620c32d3b15bfb0a8fb3165dd6cdc9c39f4db74d162f051c80826542"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:a04ba359dc5f2e21b96bfc90c4a7665441441ba61b52e992b7799493889a3419"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:f548d7976d168f0f45ac5909ca5f606ae3f6f7aa1725b22504004a053b29a7d0"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:deef02e2a9f5095463098c7c22d5566f20a6e4e14fc0996c0c2efc74d461b680"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:fe44c96bc380588d36729392b602470d88a7c18e646e95dd4348cafe3900d91d"}, + {file = "aiohttp-3.7.0-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:9210532e6e95b40d22a33415bb84423eef3f633b2d2339b97f3b26438eebc466"}, + {file = "aiohttp-3.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:a586e476a251483d222c73dfb2f27df90bc4ea1b8c7da9396236510e0d4046c8"}, + {file = "aiohttp-3.7.0-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:900012c5f12ff72b1453229afe288ddc9135176df8b3b3cc5b8f6cfde912aaa4"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux1_i686.whl", hash = "sha256:064d5f0738bcbab3e0c0ecf85c93b5ee1e07e124f994eaa03bf73687f3ecd9da"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:0a2edf27865e66a33f64fa793cd14d0aae8127ce20a858539e97c25b600556dc"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:eaa8ae734639d5a0a3b5e33a154b8bfef384cdc090706f95c387cae8b21af764"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:a8a42f05491d9c04a77806875a68f84fea9af7a59d47b7897cb166632f74606c"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:b19ded3f6957693b97ba8372aacb5b0021639bbd5e77b1e960796bcef5431969"}, + {file = "aiohttp-3.7.0-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:cefbd7ce7d1f1db43749a077e4970e29e2b631f367c9eff3862c3c886b4218dd"}, + {file = "aiohttp-3.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:7d64f7dfd4e326d9b0d11b07fcd5ebf78844ba3c8f7699f38b50b0e0db0ae68f"}, + {file = "aiohttp-3.7.0.tar.gz", hash = "sha256:176f1d2b2bc07044f4ed583216578a72a2bd35dffdeb92e0517d0aaa29d29549"}, +] aiopg = [ {file = "aiopg-1.3.4-py3-none-any.whl", hash = "sha256:b5b74a124831aad71608c3c203479db90bac4a7eb3f8982bc48c3d3e6f1e57bf"}, {file = "aiopg-1.3.4.tar.gz", hash = "sha256:23f9e4cd9f28e9d91a6de3b4fb517e8bed25511cd954acccba9fe3a702d9b7d0"}, @@ -1611,8 +1692,8 @@ allure-python-commons = [ {file = "allure_python_commons-2.10.0-py3-none-any.whl", hash = "sha256:2a717e8ca8d296bf89cd57f38fc3c21893bd7ea8cd02a6ae5420e6d1a6eda5d0"}, ] async-timeout = [ - {file = "async-timeout-4.0.2.tar.gz", hash = "sha256:2163e1640ddb52b7a8c80d0a67a08587e5d245cc9c553a74a847056bc2976b15"}, - {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"}, + {file = "async-timeout-3.0.1.tar.gz", hash = "sha256:0c3c816a028d47f659d6ff5c745cb2acf1f966da1fe5c19c77a70282b25f4c5f"}, + {file = 
"async_timeout-3.0.1-py3-none-any.whl", hash = "sha256:4291ca197d287d274d0b6cb5d6f8f8f82d434ed288f962539ff18cc9012f9ea3"}, ] asyncpg = [ {file = "asyncpg-0.27.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fca608d199ffed4903dce1bcd97ad0fe8260f405c1c225bdf0002709132171c2"}, @@ -1787,6 +1868,10 @@ cfn-lint = [ {file = "cfn-lint-0.61.3.tar.gz", hash = "sha256:3806e010d77901f5e935496df690c10e39676434a738fce1a1161cf9c7bd36a2"}, {file = "cfn_lint-0.61.3-py3-none-any.whl", hash = "sha256:8e9522fad0c7c98b31ecbdd4724f8d8a5787457cc0f71e62ae0d11104d6e52ab"}, ] +chardet = [ + {file = "chardet-3.0.4-py2.py3-none-any.whl", hash = "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"}, + {file = "chardet-3.0.4.tar.gz", hash = "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae"}, +] charset-normalizer = [ {file = "charset-normalizer-2.1.0.tar.gz", hash = "sha256:575e708016ff3a5e3681541cb9d79312c416835686d054a23accb873b254f413"}, {file = "charset_normalizer-2.1.0-py3-none-any.whl", hash = "sha256:5189b6f22b01957427f35b6a08d9a0bc45b46d3788ef5a92e978433c7a35f8a5"}, @@ -1960,6 +2045,82 @@ moto = [ {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"}, {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"}, ] +multidict = [ + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, + {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, + {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, + {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, + {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = 
"sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, + {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, + {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, + {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, + {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, + {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, + {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, + {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, + {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, + {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, + {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, + {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = "sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, + {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, + {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, + {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, + {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, + {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, + 
{file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, + {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, + {file = "multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, + {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, + {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, + {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, + {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, +] mypy = [ {file = "mypy-0.991-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7d17e0a9707d0772f4a7b878f04b4fd11f6f5bcb9b3813975a9b13c9332153ab"}, {file = "mypy-0.991-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0714258640194d75677e86c786e80ccf294972cc76885d3ebbb560f11db0003d"}, @@ -2412,6 +2573,82 @@ xmltodict = [ {file = "xmltodict-0.13.0-py2.py3-none-any.whl", hash = "sha256:aa89e8fd76320154a40d19a0df04a4695fb9dc5ba977cbb68ab3e4eb225e7852"}, {file = "xmltodict-0.13.0.tar.gz", hash = "sha256:341595a488e3e01a85a9d8911d8912fd922ede5fecc4dce437eb4b6c8d037e56"}, ] +yarl = [ + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:bb81f753c815f6b8e2ddd2eef3c855cf7da193b82396ac013c661aaa6cc6b0a5"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:47d49ac96156f0928f002e2424299b2c91d9db73e08c4cd6742923a086f1c863"}, + {file = "yarl-1.8.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3fc056e35fa6fba63248d93ff6e672c096f95f7836938241ebc8260e062832fe"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:58a3c13d1c3005dbbac5c9f0d3210b60220a65a999b1833aa46bd6677c69b08e"}, + {file = 
"yarl-1.8.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10b08293cda921157f1e7c2790999d903b3fd28cd5c208cf8826b3b508026996"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:de986979bbd87272fe557e0a8fcb66fd40ae2ddfe28a8b1ce4eae22681728fef"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c4fcfa71e2c6a3cb568cf81aadc12768b9995323186a10827beccf5fa23d4f8"}, + {file = "yarl-1.8.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae4d7ff1049f36accde9e1ef7301912a751e5bae0a9d142459646114c70ecba6"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bf071f797aec5b96abfc735ab97da9fd8f8768b43ce2abd85356a3127909d146"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:74dece2bfc60f0f70907c34b857ee98f2c6dd0f75185db133770cd67300d505f"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:df60a94d332158b444301c7f569659c926168e4d4aad2cfbf4bce0e8fb8be826"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:63243b21c6e28ec2375f932a10ce7eda65139b5b854c0f6b82ed945ba526bff3"}, + {file = "yarl-1.8.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cfa2bbca929aa742b5084fd4663dd4b87c191c844326fcb21c3afd2d11497f80"}, + {file = "yarl-1.8.2-cp310-cp310-win32.whl", hash = "sha256:b05df9ea7496df11b710081bd90ecc3a3db6adb4fee36f6a411e7bc91a18aa42"}, + {file = "yarl-1.8.2-cp310-cp310-win_amd64.whl", hash = "sha256:24ad1d10c9db1953291f56b5fe76203977f1ed05f82d09ec97acb623a7976574"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2a1fca9588f360036242f379bfea2b8b44cae2721859b1c56d033adfd5893634"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f37db05c6051eff17bc832914fe46869f8849de5b92dc4a3466cd63095d23dfd"}, + {file = "yarl-1.8.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:77e913b846a6b9c5f767b14dc1e759e5aff05502fe73079f6f4176359d832581"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0978f29222e649c351b173da2b9b4665ad1feb8d1daa9d971eb90df08702668a"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:388a45dc77198b2460eac0aca1efd6a7c09e976ee768b0d5109173e521a19daf"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2305517e332a862ef75be8fad3606ea10108662bc6fe08509d5ca99503ac2aee"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42430ff511571940d51e75cf42f1e4dbdded477e71c1b7a17f4da76c1da8ea76"}, + {file = "yarl-1.8.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3150078118f62371375e1e69b13b48288e44f6691c1069340081c3fd12c94d5b"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:c15163b6125db87c8f53c98baa5e785782078fbd2dbeaa04c6141935eb6dab7a"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d04acba75c72e6eb90745447d69f84e6c9056390f7a9724605ca9c56b4afcc6"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e7fd20d6576c10306dea2d6a5765f46f0ac5d6f53436217913e952d19237efc4"}, + {file = "yarl-1.8.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:75c16b2a900b3536dfc7014905a128a2bea8fb01f9ee26d2d7d8db0a08e7cb2c"}, + {file = 
"yarl-1.8.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6d88056a04860a98341a0cf53e950e3ac9f4e51d1b6f61a53b0609df342cc8b2"}, + {file = "yarl-1.8.2-cp311-cp311-win32.whl", hash = "sha256:fb742dcdd5eec9f26b61224c23baea46c9055cf16f62475e11b9b15dfd5c117b"}, + {file = "yarl-1.8.2-cp311-cp311-win_amd64.whl", hash = "sha256:8c46d3d89902c393a1d1e243ac847e0442d0196bbd81aecc94fcebbc2fd5857c"}, + {file = "yarl-1.8.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ceff9722e0df2e0a9e8a79c610842004fa54e5b309fe6d218e47cd52f791d7ef"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3f6b4aca43b602ba0f1459de647af954769919c4714706be36af670a5f44c9c1"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1684a9bd9077e922300ecd48003ddae7a7474e0412bea38d4631443a91d61077"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ebb78745273e51b9832ef90c0898501006670d6e059f2cdb0e999494eb1450c2"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3adeef150d528ded2a8e734ebf9ae2e658f4c49bf413f5f157a470e17a4a2e89"}, + {file = "yarl-1.8.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:57a7c87927a468e5a1dc60c17caf9597161d66457a34273ab1760219953f7f4c"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:efff27bd8cbe1f9bd127e7894942ccc20c857aa8b5a0327874f30201e5ce83d0"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a783cd344113cb88c5ff7ca32f1f16532a6f2142185147822187913eb989f739"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:705227dccbe96ab02c7cb2c43e1228e2826e7ead880bb19ec94ef279e9555b5b"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:34c09b43bd538bf6c4b891ecce94b6fa4f1f10663a8d4ca589a079a5018f6ed7"}, + {file = "yarl-1.8.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a48f4f7fea9a51098b02209d90297ac324241bf37ff6be6d2b0149ab2bd51b37"}, + {file = "yarl-1.8.2-cp37-cp37m-win32.whl", hash = "sha256:0414fd91ce0b763d4eadb4456795b307a71524dbacd015c657bb2a39db2eab89"}, + {file = "yarl-1.8.2-cp37-cp37m-win_amd64.whl", hash = "sha256:d881d152ae0007809c2c02e22aa534e702f12071e6b285e90945aa3c376463c5"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5df5e3d04101c1e5c3b1d69710b0574171cc02fddc4b23d1b2813e75f35a30b1"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7a66c506ec67eb3159eea5096acd05f5e788ceec7b96087d30c7d2865a243918"}, + {file = "yarl-1.8.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:2b4fa2606adf392051d990c3b3877d768771adc3faf2e117b9de7eb977741229"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1e21fb44e1eff06dd6ef971d4bdc611807d6bd3691223d9c01a18cec3677939e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93202666046d9edadfe9f2e7bf5e0782ea0d497b6d63da322e541665d65a044e"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fc77086ce244453e074e445104f0ecb27530d6fd3a46698e33f6c38951d5a0f1"}, + {file = "yarl-1.8.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64dd68a92cab699a233641f5929a40f02a4ede8c009068ca8aa1fe87b8c20ae3"}, + {file = 
"yarl-1.8.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1b372aad2b5f81db66ee7ec085cbad72c4da660d994e8e590c997e9b01e44901"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e6f3515aafe0209dd17fb9bdd3b4e892963370b3de781f53e1746a521fb39fc0"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dfef7350ee369197106805e193d420b75467b6cceac646ea5ed3049fcc950a05"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:728be34f70a190566d20aa13dc1f01dc44b6aa74580e10a3fb159691bc76909d"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:ff205b58dc2929191f68162633d5e10e8044398d7a45265f90a0f1d51f85f72c"}, + {file = "yarl-1.8.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:baf211dcad448a87a0d9047dc8282d7de59473ade7d7fdf22150b1d23859f946"}, + {file = "yarl-1.8.2-cp38-cp38-win32.whl", hash = "sha256:272b4f1599f1b621bf2aabe4e5b54f39a933971f4e7c9aa311d6d7dc06965165"}, + {file = "yarl-1.8.2-cp38-cp38-win_amd64.whl", hash = "sha256:326dd1d3caf910cd26a26ccbfb84c03b608ba32499b5d6eeb09252c920bcbe4f"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f8ca8ad414c85bbc50f49c0a106f951613dfa5f948ab69c10ce9b128d368baf8"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:418857f837347e8aaef682679f41e36c24250097f9e2f315d39bae3a99a34cbf"}, + {file = "yarl-1.8.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0eec05ab49e91a78700761777f284c2df119376e391db42c38ab46fd662b77"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:009a028127e0a1755c38b03244c0bea9d5565630db9c4cf9572496e947137a87"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3edac5d74bb3209c418805bda77f973117836e1de7c000e9755e572c1f7850d0"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:da65c3f263729e47351261351b8679c6429151ef9649bba08ef2528ff2c423b2"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef8fb25e52663a1c85d608f6dd72e19bd390e2ecaf29c17fb08f730226e3a08"}, + {file = "yarl-1.8.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bcd7bb1e5c45274af9a1dd7494d3c52b2be5e6bd8d7e49c612705fd45420b12d"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:44ceac0450e648de86da8e42674f9b7077d763ea80c8ceb9d1c3e41f0f0a9951"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:97209cc91189b48e7cfe777237c04af8e7cc51eb369004e061809bcdf4e55220"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:48dd18adcf98ea9cd721a25313aef49d70d413a999d7d89df44f469edfb38a06"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e59399dda559688461762800d7fb34d9e8a6a7444fd76ec33220a926c8be1516"}, + {file = "yarl-1.8.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d617c241c8c3ad5c4e78a08429fa49e4b04bedfc507b34b4d8dceb83b4af3588"}, + {file = "yarl-1.8.2-cp39-cp39-win32.whl", hash = "sha256:cb6d48d80a41f68de41212f3dfd1a9d9898d7841c8f7ce6696cf2fd9cb57ef83"}, + {file = "yarl-1.8.2-cp39-cp39-win_amd64.whl", hash = "sha256:6604711362f2dbf7160df21c416f81fac0de6dbcf0b5445a2ef25478ecc4c778"}, + {file = "yarl-1.8.2.tar.gz", hash = "sha256:49d43402c6e3013ad0978602bf6bf5328535c48d192304b91b97a3c6790b1562"}, +] zipp = [ {file = 
"zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, diff --git a/proxy/src/main.rs b/proxy/src/main.rs index 5d44774df9..1b61ab108f 100644 --- a/proxy/src/main.rs +++ b/proxy/src/main.rs @@ -30,7 +30,7 @@ use std::{borrow::Cow, future::Future, net::SocketAddr}; use tokio::{net::TcpListener, task::JoinError}; use tracing::{info, info_span, Instrument}; use utils::project_git_version; -use utils::sentry_init::{init_sentry, release_name}; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -49,7 +49,7 @@ async fn main() -> anyhow::Result<()> { .init(); // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[]); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let arg_matches = cli().get_matches(); diff --git a/pyproject.toml b/pyproject.toml index b4fb7a9e7d..a817e9dda5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ psutil = "^5.9.4" types-psutil = "^5.9.5.4" types-toml = "^0.10.8" pytest-httpserver = "^1.0.6" +aiohttp = "3.7" [tool.poetry.dev-dependencies] flake8 = "^5.0.4" diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index b130ea86bd..1a068412c8 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -38,7 +38,7 @@ use utils::{ id::NodeId, logging::{self, LogFormat}, project_git_version, - sentry_init::{init_sentry, release_name}, + sentry_init::init_sentry, signals, tcp_listener, }; @@ -173,7 +173,10 @@ fn main() -> anyhow::Result<()> { }; // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[("node_id", &conf.my_id.to_string())]); + let _sentry_guard = init_sentry( + Some(GIT_VERSION.into()), + &[("node_id", &conf.my_id.to_string())], + ); start_safekeeper(conf) } diff --git a/scripts/force_layer_download.py b/scripts/force_layer_download.py new file mode 100644 index 0000000000..5472d86d8f --- /dev/null +++ b/scripts/force_layer_download.py @@ -0,0 +1,324 @@ +import argparse +import asyncio +import json +import logging +import signal +import sys +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Awaitable, Dict, List, Tuple + +import aiohttp + + +class ClientException(Exception): + pass + + +class Client: + def __init__(self, pageserver_api_endpoint: str, max_concurrent_layer_downloads: int): + self.endpoint = pageserver_api_endpoint + self.max_concurrent_layer_downloads = max_concurrent_layer_downloads + self.sess = aiohttp.ClientSession() + + async def close(self): + await self.sess.close() + + async def __aenter__(self): + return self + + async def __aexit__(self, exc_t, exc_v, exc_tb): + await self.close() + + async def parse_response(self, resp, expected_type): + body = await resp.json() + if not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + if not isinstance(body, expected_type): + raise ClientException(f"expecting {expected_type.__name__}") + return body + + async def get_tenant_ids(self): + resp = await self.sess.get(f"{self.endpoint}/v1/tenant") + payload = await self.parse_response(resp=resp, expected_type=list) + return [t["id"] for t in payload] + + async def get_timeline_ids(self, tenant_id): + resp = await self.sess.get(f"{self.endpoint}/v1/tenant/{tenant_id}/timeline") + payload = await self.parse_response(resp=resp, 
expected_type=list) + return [t["timeline_id"] for t in payload] + + async def timeline_spawn_download_remote_layers(self, tenant_id, timeline_id, ongoing_ok=False): + resp = await self.sess.post( + f"{self.endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + json={"max_concurrent_downloads": self.max_concurrent_layer_downloads}, + ) + body = await resp.json() + if resp.status == 409: + if not ongoing_ok: + raise ClientException("download already ongoing") + # response body has same shape for ongoing and newly created + elif not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + if not isinstance(body, dict): + raise ClientException("expecting dict") + + return body + + async def timeline_poll_download_remote_layers_status( + self, + tenant_id, + timeline_id, + ): + resp = await self.sess.get( + f"{self.endpoint}/v1/tenant/{tenant_id}/timeline/{timeline_id}/download_remote_layers", + ) + body = await resp.json() + + if resp.status == 404: + return None + elif not resp.ok: + raise ClientException(f"Response: {resp} Body: {body}") + + return body + + +@dataclass +class Completed: + """The status dict returned by the API""" + + status: Dict[str, Any] + + +sigint_received = asyncio.Event() + + +async def do_timeline(client: Client, tenant_id, timeline_id): + """ + Spawn download_remote_layers task for given timeline, + then poll until the download has reached a terminal state. + + If the terminal state is not 'Completed', the method raises an exception. + The caller is responsible for inspecting `failed_download_count`. + + If there is already a task going on when this method is invoked, + it raises an exception. + """ + + # Don't start new downloads if user pressed SIGINT. + # This task will show up as "raised_exception" in the report. + if sigint_received.is_set(): + raise Exception("not starting because SIGINT received") + + # run downloads to completion + + status = await client.timeline_poll_download_remote_layers_status(tenant_id, timeline_id) + if status is not None and status["state"] == "Running": + raise Exception("download is already running") + + spawned = await client.timeline_spawn_download_remote_layers( + tenant_id, timeline_id, ongoing_ok=False + ) + + while True: + st = await client.timeline_poll_download_remote_layers_status(tenant_id, timeline_id) + logging.info(f"{tenant_id}:{timeline_id} state is: {st}") + + if spawned["task_id"] != st["task_id"]: + raise ClientException("download task ids changed while polling") + + if st["state"] == "Running": + await asyncio.sleep(10) + continue + + if st["state"] != "Completed": + raise ClientException( + f"download task reached terminal state != Completed: {st['state']}" + ) + + return Completed(st) + + +def handle_sigint(): + logging.info("SIGINT received, asyncio event set. 
Will not start new downloads.") + global sigint_received + sigint_received.set() + + +async def main(args): + async with Client(args.pageserver_http_endpoint, args.max_concurrent_layer_downloads) as client: + exit_code = await main_impl(args, args.report_output, client) + + return exit_code + + +async def taskq_handler(task_q, result_q): + while True: + try: + (id, fut) = task_q.get_nowait() + except asyncio.QueueEmpty: + logging.debug("taskq_handler observed empty task_q, returning") + return + logging.info(f"starting task {id}") + try: + res = await fut + except Exception as e: + res = e + result_q.put_nowait((id, res)) + + +async def print_progress(result_q, tasks): + while True: + await asyncio.sleep(10) + logging.info(f"{result_q.qsize()} / {len(tasks)} tasks done") + + +async def main_impl(args, report_out, client: Client): + """ + Returns OS exit status. + """ + tenant_and_timline_ids: List[Tuple[str, str]] = [] + # fill tenant_and_timline_ids based on spec + for spec in args.what: + comps = spec.split(":") + if comps == ["ALL"]: + logging.info("get tenant list") + tenant_ids = await client.get_tenant_ids() + get_timeline_id_coros = [client.get_timeline_ids(tenant_id) for tenant_id in tenant_ids] + gathered = await asyncio.gather(*get_timeline_id_coros, return_exceptions=True) + assert len(tenant_ids) == len(gathered) + tenant_and_timline_ids = [] + for tid, tlids in zip(tenant_ids, gathered): + for tlid in tlids: + tenant_and_timline_ids.append((tid, tlid)) + elif len(comps) == 1: + tid = comps[0] + tlids = await client.get_timeline_ids(tid) + for tlid in tlids: + tenant_and_timline_ids.append((tid, tlid)) + elif len(comps) == 2: + tenant_and_timline_ids.append((comps[0], comps[1])) + else: + raise ValueError(f"invalid what-spec: {spec}") + + logging.info("expanded spec:") + for tid, tlid in tenant_and_timline_ids: + logging.info(f"{tid}:{tlid}") + + logging.info("remove duplicates after expanding spec") + tmp = list(set(tenant_and_timline_ids)) + assert len(tmp) <= len(tenant_and_timline_ids) + if len(tmp) != len(tenant_and_timline_ids): + logging.info(f"spec had {len(tenant_and_timline_ids) - len(tmp)} duplicates") + tenant_and_timline_ids = tmp + + logging.info("create tasks and process them at specified concurrency") + task_q: asyncio.Queue[Tuple[str, Awaitable[Any]]] = asyncio.Queue() + tasks = { + f"{tid}:{tlid}": do_timeline(client, tid, tlid) for tid, tlid in tenant_and_timline_ids + } + for task in tasks.items(): + task_q.put_nowait(task) + + result_q: asyncio.Queue[Tuple[str, Any]] = asyncio.Queue() + taskq_handlers = [] + for _ in range(0, args.concurrent_tasks): + taskq_handlers.append(taskq_handler(task_q, result_q)) + + print_progress_task = asyncio.create_task(print_progress(result_q, tasks)) + + await asyncio.gather(*taskq_handlers) + print_progress_task.cancel() + + logging.info("all tasks handled, generating report") + + results = [] + while True: + try: + results.append(result_q.get_nowait()) + except asyncio.QueueEmpty: + break + assert task_q.empty() + + report = defaultdict(list) + for id, result in results: + logging.info(f"result for {id}: {result}") + if isinstance(result, Completed): + if result.status["failed_download_count"] == 0: + report["completed_without_errors"].append(id) + else: + report["completed_with_download_errors"].append(id) + elif isinstance(result, Exception): + report["raised_exception"].append(id) + else: + raise ValueError("unexpected result type") + json.dump(report, report_out) + + 
logging.info("--------------------------------------------------------------------------------") + + report_success = len(report["completed_without_errors"]) == len(tenant_and_timline_ids) + if not report_success: + logging.error("One or more tasks encountered errors.") + else: + logging.info("All tasks reported success.") + logging.info("Inspect log for details and report file for JSON summary.") + + return report_success + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--report-output", + type=argparse.FileType("w"), + default="-", + help="where to write report output (default: stdout)", + ) + parser.add_argument( + "--pageserver-http-endpoint", + default="http://localhost:9898", + help="pageserver http endpoint, (default http://localhost:9898)", + ) + parser.add_argument( + "--concurrent-tasks", + required=False, + default=5, + type=int, + help="Max concurrent download tasks created & polled by this script", + ) + parser.add_argument( + "--max-concurrent-layer-downloads", + dest="max_concurrent_layer_downloads", + required=False, + default=8, + type=int, + help="Max concurrent download tasks spawned by pageserver. Each layer is a separate task.", + ) + + parser.add_argument( + "what", + nargs="+", + help="what to download: ALL|tenant_id|tenant_id:timeline_id", + ) + parser.add_argument( + "--verbose", + action="store_true", + help="enable verbose logging", + ) + args = parser.parse_args() + + level = logging.INFO + if args.verbose: + level = logging.DEBUG + logging.basicConfig( + format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", + level=level, + ) + + loop = asyncio.get_event_loop() + + loop.add_signal_handler(signal.SIGINT, handle_sigint) + sys.exit(asyncio.run(main(args))) diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 6d80e96bf1..e33369bbb1 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -45,7 +45,7 @@ use storage_broker::{ use utils::id::TenantTimelineId; use utils::logging::{self, LogFormat}; use utils::project_git_version; -use utils::sentry_init::{init_sentry, release_name}; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); @@ -425,7 +425,7 @@ async fn http1_handler( #[tokio::main] async fn main() -> Result<(), Box> { // initialize sentry if SENTRY_DSN is provided - let _sentry_guard = init_sentry(release_name!(), &[]); + let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); let args = Args::parse(); diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 8b78e06c22..bdaaa95216 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -46,6 +46,12 @@ PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS: Tuple[str, ...] = ( "pageserver_remote_physical_size", ) +PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = ( + "pageserver_storage_operations_seconds_global_count", + "pageserver_storage_operations_seconds_global_sum", + "pageserver_storage_operations_seconds_global_bucket", +) + PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] = ( "pageserver_current_logical_size", "pageserver_resident_physical_size", @@ -61,13 +67,13 @@ PAGESERVER_PER_TENANT_METRICS: Tuple[str, ...] 
= ( "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", - "pageserver_storage_operations_seconds_bucket", - "pageserver_storage_operations_seconds_count", - "pageserver_storage_operations_seconds_sum", + "pageserver_storage_operations_seconds_count_total", + "pageserver_storage_operations_seconds_sum_total", "pageserver_wait_lsn_seconds_bucket", "pageserver_wait_lsn_seconds_count", "pageserver_wait_lsn_seconds_sum", "pageserver_created_persistent_files_total", "pageserver_written_persistent_bytes_total", + "pageserver_tenant_states_count", *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, ) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 29cdcb18ce..cbbf01a285 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -2,7 +2,15 @@ from contextlib import closing import psycopg2.extras from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import ( + LocalFsStorage, + NeonEnvBuilder, + RemoteStorageKind, + assert_tenant_status, + wait_for_upload, +) +from fixtures.types import Lsn +from fixtures.utils import wait_until def test_tenant_config(neon_env_builder: NeonEnvBuilder): @@ -57,7 +65,7 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "compaction_period": 20, "compaction_threshold": 10, "gc_horizon": 67108864, - "gc_period": 100, + "gc_period": 60 * 60, "image_creation_threshold": 3, "pitr_interval": 604800, # 7 days }.items() @@ -158,3 +166,46 @@ tenant_config={checkpoint_distance = 10000, compaction_target_size = 1048576}""" "pitr_interval": 60, }.items() ) + + +def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.LOCAL_FS, + test_name="test_creating_tenant_conf_after_attach", + ) + + env = neon_env_builder.init_start() + assert isinstance(env.remote_storage, LocalFsStorage) + + # tenant is created with defaults, as in without config file + (tenant_id, timeline_id) = env.neon_cli.create_tenant() + config_path = env.repo_dir / "tenants" / str(tenant_id) / "config" + assert config_path.exists(), "config file is always initially created" + + http_client = env.pageserver.http_client() + + detail = http_client.timeline_detail(tenant_id, timeline_id) + last_record_lsn = Lsn(detail["last_record_lsn"]) + assert last_record_lsn.lsn_int != 0, "initdb must have executed" + + wait_for_upload(http_client, tenant_id, timeline_id, last_record_lsn) + + http_client.tenant_detach(tenant_id) + + assert not config_path.exists(), "detach did not remove config file" + + http_client.tenant_attach(tenant_id) + wait_until( + number_of_iterations=5, + interval=1, + func=lambda: assert_tenant_status(http_client, tenant_id, "Active"), + ) + + env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "1000000"}) + contents_first = config_path.read_text() + env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "0"}) + contents_later = config_path.read_text() + + # dont test applying the setting here, we have that another test case to show it + # we just care about being able to create the file + assert len(contents_first) > len(contents_later) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index db5bb679f2..6c3454b79b 100644 --- a/test_runner/regress/test_tenant_detach.py +++ 
b/test_runner/regress/test_tenant_detach.py @@ -6,6 +6,7 @@ from threading import Thread import asyncpg import pytest from fixtures.log_helper import log +from fixtures.metrics import parse_metrics from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, @@ -59,11 +60,11 @@ def test_tenant_reattach( # create new nenant tenant_id, timeline_id = env.neon_cli.create_tenant() - pg = env.postgres.create_start("main", tenant_id=tenant_id) - with pg.cursor() as cur: - cur.execute("CREATE TABLE t(key int primary key, value text)") - cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") - current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + cur.execute("CREATE TABLE t(key int primary key, value text)") + cur.execute("INSERT INTO t SELECT generate_series(1,100000), 'payload'") + current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) # Wait for the all data to be processed by the pageserver and uploaded in remote storage wait_for_last_record_lsn(pageserver_http, tenant_id, timeline_id, current_lsn) @@ -78,15 +79,34 @@ def test_tenant_reattach( ".*failed to perform remote task UploadMetadata.*, will retry.*" ) + ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + tenant_metric_filter = { + "tenant_id": str(tenant_id), + "timeline_id": str(timeline_id), + } + pageserver_last_record_lsn_before_detach = int( + ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value + ) + pageserver_http.tenant_detach(tenant_id) pageserver_http.tenant_attach(tenant_id) - with pg.cursor() as cur: - assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + time.sleep(1) # for metrics propagation - # Check that we had to retry the downloads - assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") - assert env.pageserver.log_contains(".*download.*failed, will retry.*") + ps_metrics = parse_metrics(pageserver_http.get_metrics(), "pageserver") + pageserver_last_record_lsn = int( + ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value + ) + + assert pageserver_last_record_lsn_before_detach == pageserver_last_record_lsn + + with env.postgres.create_start("main", tenant_id=tenant_id) as pg: + with pg.cursor() as cur: + assert query_scalar(cur, "SELECT count(*) FROM t") == 100000 + + # Check that we had to retry the downloads + assert env.pageserver.log_contains(".*list prefixes.*failed, will retry.*") + assert env.pageserver.log_contains(".*download.*failed, will retry.*") num_connections = 10 @@ -237,7 +257,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - env.pageserver.allowed_errors.append(".*NotFound\\(Tenant .* not found") + env.pageserver.allowed_errors.append(".*NotFound: Tenant .* not found") # first check for non existing tenant tenant_id = TenantId.generate() @@ -272,8 +292,10 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder): bogus_timeline_id = TimelineId.generate() pageserver_http.timeline_gc(tenant_id, bogus_timeline_id, 0) - # the error will be printed to the log too + # the error will be printed to the log too env.pageserver.allowed_errors.append(".*gc target timeline does not exist.*") + # Timelines get stopped during detach, ignore the gc calls that error, whitnessing that + 
env.pageserver.allowed_errors.append(".*InternalServerError\\(timeline is Stopping.*") # Detach while running manual GC. # It should wait for manual GC to finish because it runs in a task associated with the tenant. diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 9477ae3c25..e56bb1b469 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -1,5 +1,6 @@ import os import shutil +import time from contextlib import closing from datetime import datetime from pathlib import Path @@ -8,6 +9,7 @@ from typing import List import pytest from fixtures.log_helper import log from fixtures.metrics import ( + PAGESERVER_GLOBAL_METRICS, PAGESERVER_PER_TENANT_METRICS, PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, parse_metrics, @@ -160,6 +162,14 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): f"process_start_time_seconds (UTC): {datetime.fromtimestamp(metrics.query_one('process_start_time_seconds').value)}" ) + # Test (a subset of) pageserver global metrics + for metric in PAGESERVER_GLOBAL_METRICS: + ps_samples = ps_metrics.query_all(metric, {}) + assert len(ps_samples) > 0 + for sample in ps_samples: + labels = ",".join([f'{key}="{value}"' for key, value in sample.labels.items()]) + log.info(f"{sample.name}{{{labels}}} {sample.value}") + @pytest.mark.parametrize( "remote_storage_kind", @@ -259,7 +269,7 @@ def test_pageserver_with_empty_tenants( files_in_timelines_dir == 0 ), f"Tenant {tenant_with_empty_timelines_dir} should have an empty timelines/ directory" - # Trigger timeline reinitialization after pageserver restart + # Trigger timeline re-initialization after pageserver restart env.postgres.stop_all() env.pageserver.stop() @@ -278,7 +288,51 @@ def test_pageserver_with_empty_tenants( broken_tenant["state"] == "Broken" ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + broken_tenant_status = client.tenant_status(tenant_without_timelines_dir) + assert ( + broken_tenant_status["state"] == "Broken" + ), f"Tenant {tenant_without_timelines_dir} without timelines dir should be broken" + + assert env.pageserver.log_contains(".*Setting tenant as Broken state, reason:.*") + [loaded_tenant] = [t for t in tenants if t["id"] == str(tenant_with_empty_timelines_dir)] assert ( loaded_tenant["state"] == "Active" ), "Tenant {tenant_with_empty_timelines_dir} with empty timelines dir should be active and ready for timeline creation" + + loaded_tenant_status = client.tenant_status(tenant_with_empty_timelines_dir) + assert ( + loaded_tenant_status["state"] == "Active" + ), f"Tenant {tenant_with_empty_timelines_dir} without timelines dir should be active" + + time.sleep(1) # to allow metrics propagation + + ps_metrics = parse_metrics(client.get_metrics(), "pageserver") + broken_tenants_metric_filter = { + "tenant_id": str(tenant_without_timelines_dir), + "state": "broken", + } + active_tenants_metric_filter = { + "tenant_id": str(tenant_with_empty_timelines_dir), + "state": "active", + } + + tenant_active_count = int( + ps_metrics.query_one( + "pageserver_tenant_states_count", filter=active_tenants_metric_filter + ).value + ) + + assert ( + tenant_active_count == 1 + ), f"Tenant {tenant_with_empty_timelines_dir} should have metric as active" + + tenant_broken_count = int( + ps_metrics.query_one( + "pageserver_tenant_states_count", filter=broken_tenants_metric_filter + ).value + ) + + assert ( + tenant_broken_count == 1 + ), f"Tenant {tenant_without_timelines_dir} should have 
metric as broken" diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f4b71ae9b7..3a852b2207 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -20,7 +20,9 @@ clap = { version = "4", features = ["derive", "string"] } crossbeam-utils = { version = "0.8" } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } +futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } +futures-executor = { version = "0.3" } futures-task = { version = "0.3", default-features = false, features = ["std"] } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } indexmap = { version = "1", default-features = false, features = ["std"] } @@ -31,17 +33,21 @@ memchr = { version = "2" } nom = { version = "7" } num-bigint = { version = "0.4" } num-integer = { version = "0.1", features = ["i128"] } -num-traits = { version = "0.2", features = ["i128", "libm"] } +num-traits = { version = "0.2", features = ["i128"] } prost = { version = "0.11" } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-syntax = { version = "0.6" } +reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] } +ring = { version = "0.16", features = ["std"] } +rustls = { version = "0.20", features = ["dangerous_configuration"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["raw_value"] } socket2 = { version = "0.4", default-features = false, features = ["all"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] } tokio-util = { version = "0.7", features = ["codec", "io"] } +tonic = { version = "0.8", features = ["tls-roots"] } tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] } tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } From a41b5244a84116c16ace9143a02f8d21d218a84c Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 20 Feb 2023 18:22:49 +0300 Subject: [PATCH 036/209] Add new safekeeper to ap-southeast-1 prod (#3645) (#3646) To trigger deployment of #3645 to production. 
--- .github/ansible/prod.ap-southeast-1.hosts.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 7c6d1db6d7..71fced23c2 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -36,3 +36,5 @@ storage: ansible_host: i-0e338adda8eb2d19f safekeeper-2.ap-southeast-1.aws.neon.tech: ansible_host: i-04fb63634e4679eb9 + safekeeper-3.ap-southeast-1.aws.neon.tech: + ansible_host: i-05481f3bc88cfc2d4 From 78aca668d08190934708f864fbddd14da316e8a8 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 19:31:53 +0200 Subject: [PATCH 037/209] fix: log download failed error (#3661) Fixes #3659 --- pageserver/src/tenant/tasks.rs | 2 +- pageserver/src/tenant/timeline.rs | 1 + test_runner/regress/test_tenants.py | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index e9ce52d1ab..20d1d2bfb6 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -74,7 +74,7 @@ async fn compaction_loop(tenant_id: TenantId) { let period = tenant.get_compaction_period(); // TODO: we shouldn't need to await to find tenant and this could be moved outside of - // loop + // loop, #3501. There are also additional "allowed_errors" in tests. if first { first = false; if random_init_delay(period, &cancel).await.is_err() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f2b0a98509..8bc02cd10a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3745,6 +3745,7 @@ impl Timeline { remote_layer.ongoing_download.close(); } else { // Keep semaphore open. We'll drop the permit at the end of the function. + info!("on-demand download failed: {:?}", result.as_ref().unwrap_err()); } // Don't treat it as an error if the task that triggered the download diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index e56bb1b469..9e75396799 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -250,6 +250,10 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*could not load tenant.*Failed to list timelines directory.*" ) + # this is until #3501 + env.pageserver.allowed_errors.append( + ".*Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant" + ) client = env.pageserver.http_client() From 15273a9b669c75b8a5bcda186715e7c1a940bfd6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 20:20:13 +0200 Subject: [PATCH 038/209] chore: ignore all compaction inactive tenant errors (#3665) these are happening in tests because of #3655 but they sure took some time to appear. makes the `Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant` into a globally allowed error, because it has been seen failing on different test cases. 
--- test_runner/fixtures/neon_fixtures.py | 2 ++ test_runner/regress/test_tenants.py | 4 ---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 63196609cc..73f224039e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2080,6 +2080,8 @@ class NeonPageserver(PgProtocol): ".*query handler for 'pagestream.*failed: Timeline .* was not found", # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock() ".*query handler for 'pagestream.*failed: Timeline .* is not active", # timeline delete in progress ".*task iteration took longer than the configured period.*", + # this is until #3501 + ".*Compaction failed, retrying in [^:]+: Cannot run compaction iteration on inactive tenant", ] def start( diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 9e75396799..e56bb1b469 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -250,10 +250,6 @@ def test_pageserver_with_empty_tenants( env.pageserver.allowed_errors.append( ".*could not load tenant.*Failed to list timelines directory.*" ) - # this is until #3501 - env.pageserver.allowed_errors.append( - ".*Compaction failed, retrying in 2s: Cannot run compaction iteration on inactive tenant" - ) client = env.pageserver.http_client() From 43bf6d0a0f2695074f360bbc3b1deea2065f6321 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 21:09:31 +0200 Subject: [PATCH 039/209] calculate_logical_size: no longer use spawn_blocking (#3664) Calculation of logical size is now async because of layer downloads, so we shouldn't use spawn_blocking for it. Use of `spawn_blocking` exhausted resources which are needed by `tokio::io::copy` when copying from a stream to a file which lead to deadlock. Fixes: #3657 --- pageserver/src/tenant/timeline.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8bc02cd10a..176eb61ff3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1770,15 +1770,9 @@ impl Timeline { let calculation = async { let cancel = cancel.child_token(); let ctx = ctx.attached_child(); - tokio::task::spawn_blocking(move || { - // Run in a separate thread since this can do a lot of - // synchronous file IO without .await inbetween - // if there are no RemoteLayers that would require downloading. - let h = tokio::runtime::Handle::current(); - h.block_on(self_calculation.calculate_logical_size(init_lsn, cancel, &ctx)) - }) - .await - .context("Failed to spawn calculation result task")? + self_calculation + .calculate_logical_size(init_lsn, cancel, &ctx) + .await }; let timeline_state_cancellation = async { loop { @@ -1811,7 +1805,7 @@ impl Timeline { tokio::pin!(calculation); loop { tokio::select! { - res = &mut calculation => { return res } + res = &mut calculation => { return res } reason = timeline_state_cancellation => { debug!(reason = reason, "cancelling calculation"); cancel.cancel(); From a51b269f1502b69c5b2720d0a0e20b1e702ebd58 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 21 Feb 2023 21:14:08 +0200 Subject: [PATCH 040/209] fix: hold permit until GetObject eof (#3663) previously we applied the ratelimiting only up to receiving the headers from s3, or somewhere near it. 
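In other words, the semaphore permit was released roughly once the GetObject response headers arrived, so the (often much longer) body download ran outside the concurrency limit. The idea behind the fix is to tie the permit's lifetime to the body reader itself. Below is a minimal, self-contained sketch of that idea only; it is not the patch's code (the real adapter wraps the SDK's body stream with `pin_project_lite`), and the names here (`PermitRead`, the in-memory stand-in body) are assumptions for illustration:

```rust
use std::sync::Arc;
use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf};
use tokio::sync::{OwnedSemaphorePermit, Semaphore};

/// Reader that owns a semaphore permit, so the permit is released only when the
/// reader is dropped, i.e. after the whole body has been consumed.
struct PermitRead<R> {
    _permit: OwnedSemaphorePermit,
    inner: R,
}

impl<R: AsyncRead + Unpin> AsyncRead for PermitRead<R> {
    fn poll_read(
        mut self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
        buf: &mut ReadBuf<'_>,
    ) -> std::task::Poll<std::io::Result<()>> {
        // Delegate to the wrapped reader; the permit is merely carried alongside.
        std::pin::Pin::new(&mut self.inner).poll_read(cx, buf)
    }
}

#[tokio::main]
async fn main() {
    let limiter = Arc::new(Semaphore::new(1));
    let permit = limiter.clone().acquire_owned().await.unwrap();

    // Stand-in for the S3 body stream: an in-memory byte slice.
    let mut reader = PermitRead {
        _permit: permit,
        inner: &b"object body"[..],
    };

    let mut out = Vec::new();
    reader.read_to_end(&mut out).await.unwrap();
    assert_eq!(out, b"object body");
    assert_eq!(limiter.available_permits(), 0); // still limited while the reader exists

    drop(reader);
    assert_eq!(limiter.available_permits(), 1); // released only once the download is done
}
```
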
the commit adds an adapter which carries the permit until the AsyncRead has been disposed. fixes #3662. --- Cargo.lock | 1 + libs/remote_storage/Cargo.toml | 2 +- libs/remote_storage/src/s3_bucket.rs | 45 +++++++++++++++++++++++----- 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d154b4eaea..dab3d12263 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3054,6 +3054,7 @@ dependencies = [ "hyper", "metrics", "once_cell", + "pin-project-lite", "serde", "serde_json", "tempfile", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 4382fbac32..15812e8439 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -21,7 +21,7 @@ toml_edit.workspace = true tracing.workspace = true metrics.workspace = true utils.workspace = true - +pin-project-lite.workspace = true workspace_hack.workspace = true [dev-dependencies] diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 18a2c5dedd..93f5e0596e 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -20,7 +20,10 @@ use aws_sdk_s3::{ }; use aws_smithy_http::body::SdkBody; use hyper::Body; -use tokio::{io, sync::Semaphore}; +use tokio::{ + io::{self, AsyncRead}, + sync::Semaphore, +}; use tokio_util::io::ReaderStream; use tracing::debug; @@ -102,7 +105,7 @@ pub struct S3Bucket { // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded. // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold. // The helps to ensure we don't exceed the thresholds. - concurrency_limiter: Semaphore, + concurrency_limiter: Arc, } #[derive(Default)] @@ -162,7 +165,7 @@ impl S3Bucket { client, bucket_name: aws_config.bucket_name.clone(), prefix_in_bucket, - concurrency_limiter: Semaphore::new(aws_config.concurrency_limit.get()), + concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())), }) } @@ -194,9 +197,10 @@ impl S3Bucket { } async fn download_object(&self, request: GetObjectRequest) -> Result { - let _guard = self + let permit = self .concurrency_limiter - .acquire() + .clone() + .acquire_owned() .await .context("Concurrency limiter semaphore got closed during S3 download") .map_err(DownloadError::Other)?; @@ -217,9 +221,10 @@ impl S3Bucket { let metadata = object_output.metadata().cloned().map(StorageMetadata); Ok(Download { metadata, - download_stream: Box::pin(io::BufReader::new( + download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new( + permit, object_output.body.into_async_read(), - )), + ))), }) } Err(SdkError::ServiceError { @@ -240,6 +245,32 @@ impl S3Bucket { } } +pin_project_lite::pin_project! { + /// An `AsyncRead` adapter which carries a permit for the lifetime of the value. 
+ struct RatelimitedAsyncRead { + permit: tokio::sync::OwnedSemaphorePermit, + #[pin] + inner: S, + } +} + +impl RatelimitedAsyncRead { + fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self { + RatelimitedAsyncRead { permit, inner } + } +} + +impl AsyncRead for RatelimitedAsyncRead { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut io::ReadBuf<'_>, + ) -> std::task::Poll> { + let this = self.project(); + this.inner.poll_read(cx, buf) + } +} + #[async_trait::async_trait] impl RemoteStorage for S3Bucket { async fn list(&self) -> anyhow::Result> { From 38cd90dd0c85ff6286e689cae33a95fe095ffca0 Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Tue, 21 Feb 2023 21:11:52 +0100 Subject: [PATCH 041/209] Add -v to ansible invocations (#3670) To get more debug output on failures --- .github/workflows/deploy-dev.yml | 2 +- .github/workflows/deploy-prod.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-dev.yml b/.github/workflows/deploy-dev.yml index 409517bf63..b080a29f7c 100644 --- a/.github/workflows/deploy-dev.yml +++ b/.github/workflows/deploy-dev.yml @@ -67,7 +67,7 @@ jobs: ./get_binaries.sh ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + ansible-playbook -v deploy.yaml -i staging.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_STAGING_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version - name: Cleanup ansible folder diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml index 540d187274..6096ac8ab9 100644 --- a/.github/workflows/deploy-prod.yml +++ b/.github/workflows/deploy-prod.yml @@ -68,7 +68,7 @@ jobs: ./get_binaries.sh ansible-galaxy collection install sivel.toiletwater - ansible-playbook deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} + ansible-playbook -v deploy.yaml -i prod.${{ matrix.target_region }}.hosts.yaml -e @ssm_config -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }} rm -f neon_install.tar.gz .neon_current_version deploy-proxy-prod-new: From 46cc8b7982b270796d6266861801b2e466319968 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Wed, 22 Feb 2023 12:55:41 +0300 Subject: [PATCH 042/209] Remove safekeeper-1.ap-southeast-1.aws.neon.tech (#3671) We migrated all timelines to `safekeeper-3.ap-southeast-1.aws.neon.tech`, now old instance can be removed. 
--- .github/ansible/prod.ap-southeast-1.hosts.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml index 13b44f4052..8ccb67b04a 100644 --- a/.github/ansible/prod.ap-southeast-1.hosts.yaml +++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml @@ -32,8 +32,6 @@ storage: hosts: safekeeper-0.ap-southeast-1.aws.neon.tech: ansible_host: i-0d6f1dc5161eef894 - safekeeper-1.ap-southeast-1.aws.neon.tech: - ansible_host: i-0e338adda8eb2d19f safekeeper-2.ap-southeast-1.aws.neon.tech: ansible_host: i-04fb63634e4679eb9 safekeeper-3.ap-southeast-1.aws.neon.tech: From efef68ce9972ba47ce7890e386d1b066e67f695d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 22 Feb 2023 22:52:22 +0400 Subject: [PATCH 043/209] Bump vendor/postgres to include hotfix for unlogged tables with indexes. https://github.com/neondatabase/postgres/pull/259 https://github.com/neondatabase/postgres/pull/262 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index f210ac524b..b44ee1d9a5 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit f210ac524b42d2d6f404f8505c64de36e977d17c +Subproject commit b44ee1d9a5b061ababb31f89a4e30a1795573f51 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 33f9763454..303fa4050f 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 33f976345490351f951d72f81621c2263c186c9a +Subproject commit 303fa4050fafba3771052b3d49b8e2d00d6ea2e3 From 91a4ea0de2a69b6003c6d224b6c96789ce1a4513 Mon Sep 17 00:00:00 2001 From: MMeent Date: Thu, 23 Feb 2023 15:10:22 +0100 Subject: [PATCH 044/209] Update vendored PostgreSQL versions to 14.7 and 15.2 (#3581) ## Describe your changes Rebase vendored PostgreSQL onto 14.7 and 15.2 ## Issue ticket number and link #3579 ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [x] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [x] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ``` The version of PostgreSQL that we use is updated to 14.7 for PostgreSQL 14 and 15.2 for PostgreSQL 15. 
``` --- test_runner/fixtures/neon_fixtures.py | 10 +- test_runner/regress/test_tenant_size.py | 164 ++++++++++++++++-------- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 4 files changed, 120 insertions(+), 58 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 73f224039e..c4b3d057f8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1669,7 +1669,7 @@ class AbstractNeonCli(abc.ABC): timeout=timeout, ) if not res.returncode: - log.info(f"Run success: {res.stdout}") + log.info(f"Run {res.args} success: {res.stdout}") elif check_return_code: # this way command output will be in recorded and shown in CI in failure message msg = f"""\ @@ -3463,6 +3463,14 @@ def wait_for_last_flush_lsn( return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) +def wait_for_wal_insert_lsn( + env: NeonEnv, pg: Postgres, tenant: TenantId, timeline: TimelineId +) -> Lsn: + """Wait for pageserver to catch up the latest flush LSN, returns the last observed lsn.""" + last_flush_lsn = Lsn(pg.safe_psql("SELECT pg_current_wal_insert_lsn()")[0][0]) + return wait_for_last_record_lsn(env.pageserver.http_client(), tenant, timeline, last_flush_lsn) + + def fork_at_current_lsn( env: NeonEnv, pg: Postgres, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 8c2996f491..a4b5f7739a 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -3,8 +3,15 @@ from typing import List, Tuple import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn -from fixtures.types import Lsn +from fixtures.neon_fixtures import ( + NeonEnv, + NeonEnvBuilder, + PageserverHttpClient, + Postgres, + wait_for_last_flush_lsn, + wait_for_wal_insert_lsn, +) +from fixtures.types import Lsn, TenantId, TimelineId def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path): @@ -324,7 +331,7 @@ def test_single_branch_get_tenant_size_grows( # inserts is larger than gc_horizon. for example 0x20000 here hid the fact # that there next_gc_cutoff could be smaller than initdb_lsn, which will # obviously lead to issues when calculating the size. 
- gc_horizon = 0x30000 + gc_horizon = 0x38000 neon_env_builder.pageserver_config_override = f"tenant_config={{compaction_period='0s', gc_period='0s', pitr_interval='0sec', gc_horizon={gc_horizon}}}" env = neon_env_builder.init_start() @@ -334,29 +341,75 @@ def test_single_branch_get_tenant_size_grows( http_client = env.pageserver.http_client() - collected_responses: List[Tuple[Lsn, int]] = [] + collected_responses: List[Tuple[str, Lsn, int]] = [] size_debug_file = open(test_output_dir / "size_debug.html", "w") - def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int): - if current_lsn - initdb_lsn > gc_horizon: + def check_size_change( + current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev_size: int + ): + if current_lsn - initdb_lsn >= gc_horizon: assert ( - size >= prev + size >= prev_size ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size" else: assert ( - size > prev + size > prev_size ), "tenant_size should grow, because we continue to add WAL to initial snapshot size" - with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg: - initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + def get_current_consistent_size( + env: NeonEnv, + pg: Postgres, + size_debug_file, # apparently there is no public signature for open()... + http_client: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> Tuple[Lsn, int]: + consistent = False + size_debug = None + + current_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id) + # We want to make sure we have a self-consistent set of values. + # Size changes with WAL, so only if both before and after getting + # the size of the tenant reports the same WAL insert LSN, we're OK + # to use that (size, LSN) combination. + # Note that 'wait_for_wal_flush_lsn' is not accurate enough: There + # can be more wal after the flush LSN that can arrive on the + # pageserver before we're requesting the page size. + # Anyway, in general this is only one iteration, so in general + # this is fine. + while not consistent: + size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + size_debug = http_client.tenant_size_debug(tenant_id) + + after_lsn = wait_for_wal_insert_lsn(env, pg, tenant_id, timeline_id) + consistent = current_lsn == after_lsn + current_lsn = after_lsn + size_debug_file.write(size_debug) + return (current_lsn, size) + + with env.postgres.create_start( + branch_name, + tenant_id=tenant_id, + ### autovacuum is disabled to limit WAL logging. 
+ config_lines=["autovacuum=off"], + ) as pg: + (initdb_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) + collected_responses.append(("INITDB", initdb_lsn, size)) + with pg.cursor() as cur: - cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)") + cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL) WITH (fillfactor = 40)") + + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) + collected_responses.append(("CREATE", current_lsn, size)) batch_size = 100 - i = 0 - while True: + for i in range(3): with pg.cursor() as cur: cur.execute( f"INSERT INTO t0(i) SELECT i FROM generate_series({batch_size} * %s, ({batch_size} * (%s + 1)) - 1) s(i)", @@ -365,27 +418,24 @@ def test_single_branch_get_tenant_size_grows( i += 1 - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + prev_size = collected_responses[-1][2] + if size == 0: + assert prev_size == 0 + else: + # branch start shouldn't be past gc_horizon yet + # thus the size should grow as we insert more data + # "gc_horizon" is tuned so that it kicks in _after_ the + # insert phase, but before the update phase ends. + assert ( + current_lsn - initdb_lsn <= gc_horizon + ), "Tuning of GC window is likely out-of-date" + assert size > prev_size - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) - - if len(collected_responses) > 0: - prev = collected_responses[-1][1] - if size == 0: - assert prev == 0 - else: - # branch start shouldn't be past gc_horizon yet - # thus the size should grow as we insert more data - assert current_lsn - initdb_lsn <= gc_horizon - assert size > prev - - collected_responses.append((current_lsn, size)) - - if len(collected_responses) > 2: - break + collected_responses.append(("INSERT", current_lsn, size)) while True: with pg.cursor() as cur: @@ -397,18 +447,15 @@ def test_single_branch_get_tenant_size_grows( if updated == 0: break - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size, sizes = http_client.tenant_size_and_modelinputs(tenant_id) + prev_size = collected_responses[-1][2] - size_debug = http_client.tenant_size_debug(tenant_id) - size_debug_file.write(size_debug) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - prev = collected_responses[-1][1] - - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) - - collected_responses.append((current_lsn, size)) + collected_responses.append(("UPDATE", current_lsn, size)) while True: with pg.cursor() as cur: @@ -418,40 +465,47 @@ def test_single_branch_get_tenant_size_grows( if deleted == 0: break - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append((current_lsn, size)) + 
collected_responses.append(("DELETE", current_lsn, size)) with pg.cursor() as cur: cur.execute("DROP TABLE t0") - current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) + # The size of the tenant should still be as large as before we dropped + # the table, because the drop operation can still be undone in the PITR + # defined by gc_horizon. + (current_lsn, size) = get_current_consistent_size( + env, pg, size_debug_file, http_client, tenant_id, timeline_id + ) - size = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev_size = collected_responses[-1][2] - check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev) + check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev_size) - collected_responses.append((current_lsn, size)) + collected_responses.append(("DROP", current_lsn, size)) # this isn't too many lines to forget for a while. observed while # developing these tests that locally the value is a bit more than what we # get in the ci. - for lsn, size in collected_responses: - log.info(f"collected: {lsn}, {size}") + for phase, lsn, size in collected_responses: + log.info(f"collected: {phase}, {lsn}, {size}") env.pageserver.stop() env.pageserver.start() + size_after = http_client.tenant_size(tenant_id) + size_debug = http_client.tenant_size_debug(tenant_id) + size_debug_file.write(size_debug) size_debug_file.close() - size_after = http_client.tenant_size(tenant_id) - prev = collected_responses[-1][1] + prev = collected_responses[-1][2] assert size_after == prev, "size after restarting pageserver should not have changed" diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index b44ee1d9a5..468d3c0824 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit b44ee1d9a5b061ababb31f89a4e30a1795573f51 +Subproject commit 468d3c08245906f083fed1009759f9f953f5915d diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 303fa4050f..9a2093383a 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 303fa4050fafba3771052b3d49b8e2d00d6ea2e3 +Subproject commit 9a2093383ae19906f025b008ceecf89ebc9ea869 From 240913912a4d9450ac3c67810b60f1df7e65afb4 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 24 Feb 2023 13:45:32 +0200 Subject: [PATCH 045/209] Fix UNLOGGED tables. Instead of trying to create missing files on the way, send init fork contents as main fork from pageserver during basebackup. Add test for that. Call put_rel_drop for init forks; previously they weren't removed. Bump vendor/postgres to revert previous approach on Postgres side. 
Co-authored-by: Arseny Sher ref https://github.com/neondatabase/postgres/pull/264 ref https://github.com/neondatabase/postgres/pull/259 ref https://github.com/neondatabase/neon/issues/1222 --- libs/pageserver_api/src/reltag.rs | 9 ++++++ pageserver/src/basebackup.rs | 45 ++++++++++++++++++++-------- pageserver/src/walingest.rs | 4 +-- test_runner/regress/test_unlogged.py | 34 +++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- 6 files changed, 79 insertions(+), 17 deletions(-) create mode 100644 test_runner/regress/test_unlogged.py diff --git a/libs/pageserver_api/src/reltag.rs b/libs/pageserver_api/src/reltag.rs index 43d38bd986..12693379f5 100644 --- a/libs/pageserver_api/src/reltag.rs +++ b/libs/pageserver_api/src/reltag.rs @@ -98,6 +98,15 @@ impl RelTag { name } + + pub fn with_forknum(&self, forknum: u8) -> Self { + RelTag { + forknum, + spcnode: self.spcnode, + dbnode: self.dbnode, + relnode: self.relnode, + } + } } /// diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 06d4853274..41fa0a67bb 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -33,6 +33,7 @@ use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants::{DEFAULTTABLESPACE_OID, GLOBALTABLESPACE_OID}; use postgres_ffi::pg_constants::{PGDATA_SPECIAL_FILES, PGDATA_SUBDIRS, PG_HBA}; +use postgres_ffi::relfile_utils::{INIT_FORKNUM, MAIN_FORKNUM}; use postgres_ffi::TransactionId; use postgres_ffi::XLogFileName; use postgres_ffi::PG_TLI; @@ -190,14 +191,31 @@ where { self.add_dbdir(spcnode, dbnode, has_relmap_file).await?; - // Gather and send relational files in each database if full backup is requested. - if self.full_backup { - for rel in self - .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) - .await? - { - self.add_rel(rel).await?; + // If full backup is requested, include all relation files. + // Otherwise only include init forks of unlogged relations. + let rels = self + .timeline + .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .await?; + for &rel in rels.iter() { + // Send init fork as main fork to provide well formed empty + // contents of UNLOGGED relations. Postgres copies it in + // `reinit.c` during recovery. + if rel.forknum == INIT_FORKNUM { + // I doubt we need _init fork itself, but having it at least + // serves as a marker relation is unlogged. + self.add_rel(rel, rel).await?; + self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?; + continue; + } + + if self.full_backup { + if rel.forknum == MAIN_FORKNUM && rels.contains(&rel.with_forknum(INIT_FORKNUM)) + { + // skip this, will include it when we reach the init fork + continue; + } + self.add_rel(rel, rel).await?; } } } @@ -220,15 +238,16 @@ where Ok(()) } - async fn add_rel(&mut self, tag: RelTag) -> anyhow::Result<()> { + /// Add contents of relfilenode `src`, naming it as `dst`. 
+ async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(tag, self.lsn, false, self.ctx) + .get_rel_size(src, self.lsn, false, self.ctx) .await?; // If the relation is empty, create an empty file if nblocks == 0 { - let file_name = tag.to_segfile_name(0); + let file_name = dst.to_segfile_name(0); let header = new_tar_header(&file_name, 0)?; self.ar.append(&header, &mut io::empty()).await?; return Ok(()); @@ -244,12 +263,12 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(tag, blknum, self.lsn, false, self.ctx) + .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } - let file_name = tag.to_segfile_name(seg as u32); + let file_name = dst.to_segfile_name(seg as u32); let header = new_tar_header(&file_name, segment_data.len() as u64)?; self.ar.append(&header, segment_data.as_slice()).await?; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 3761c65668..63d568a342 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -37,7 +37,7 @@ use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; -use postgres_ffi::relfile_utils::{FSM_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; +use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::v14::nonrelfile_utils::mx_offset_to_member_segment; use postgres_ffi::v14::xlog_utils::*; use postgres_ffi::v14::CheckPoint; @@ -762,7 +762,7 @@ impl<'a> WalIngest<'a> { )?; for xnode in &parsed.xnodes { - for forknum in MAIN_FORKNUM..=VISIBILITYMAP_FORKNUM { + for forknum in MAIN_FORKNUM..=INIT_FORKNUM { let rel = RelTag { forknum, spcnode: xnode.spcnode, diff --git a/test_runner/regress/test_unlogged.py b/test_runner/regress/test_unlogged.py new file mode 100644 index 0000000000..b6b20f1230 --- /dev/null +++ b/test_runner/regress/test_unlogged.py @@ -0,0 +1,34 @@ +from fixtures.neon_fixtures import NeonEnv, fork_at_current_lsn + + +# +# Test UNLOGGED tables/relations. Postgres copies init fork contents to main +# fork to reset them during recovery. In Neon, pageserver directly sends init +# fork contents as main fork during basebackup. 
+# +def test_unlogged(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_unlogged", "empty") + pg = env.postgres.create_start("test_unlogged") + + conn = pg.connect() + cur = conn.cursor() + + cur.execute("CREATE UNLOGGED TABLE iut (id int);") + # create index to test unlogged index relation as well + cur.execute("CREATE UNIQUE INDEX iut_idx ON iut (id);") + cur.execute("INSERT INTO iut values (42);") + + # create another compute to fetch inital empty contents from pageserver + fork_at_current_lsn(env, pg, "test_unlogged_basebackup", "test_unlogged") + pg2 = env.postgres.create_start( + "test_unlogged_basebackup", + ) + + conn2 = pg2.connect() + cur2 = conn2.cursor() + # after restart table should be empty but valid + cur2.execute("PREPARE iut_plan (int) AS INSERT INTO iut VALUES ($1)") + cur2.execute("EXECUTE iut_plan (43);") + cur2.execute("SELECT * FROM iut") + assert cur2.fetchall() == [(43,)] diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 468d3c0824..5fb2e0bba0 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 468d3c08245906f083fed1009759f9f953f5915d +Subproject commit 5fb2e0bba06cc018ee2506f337c91751ab695454 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 9a2093383a..919851e781 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 9a2093383ae19906f025b008ceecf89ebc9ea869 +Subproject commit 919851e7811fcb2ecfc67f35bfd63a35639c73b5 From 99752286d836653e3af3bc798871394970af8db0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lassi=20P=C3=B6l=C3=B6nen?= Date: Tue, 14 Mar 2023 15:23:46 +0200 Subject: [PATCH 046/209] Use RollingUpdate strategy also for legacy proxy (#3814) ## Describe your changes We have previously changed the neon-proxy to use RollingUpdate. This should be enabled in legacy proxy too in order to avoid breaking connections for the clients and allow for example backups to run even during deployment. (https://github.com/neondatabase/neon/pull/3683) ## Issue ticket number and link https://github.com/neondatabase/neon/issues/3333 --- ...od-us-west-2-eta.neon-proxy-scram-legacy.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml index e67a3e4461..d23ea41bd7 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml @@ -1,6 +1,22 @@ # Helm chart values for neon-proxy-scram. # This is a YAML-formatted file. +deploymentStrategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 100% + maxUnavailable: 50% + +# Delay the kill signal by 7 days (7 * 24 * 60 * 60) +# The pod(s) will stay in Terminating, keeps the existing connections +# but doesn't receive new ones +containerLifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "sleep 604800"] +terminationGracePeriodSeconds: 604800 + + image: repository: neondatabase/neon From afd0a6b39aa10392a74ab1200fe4765936b75ce0 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Mar 2023 11:44:55 +0400 Subject: [PATCH 047/209] Forward framed read buf contents to compute before proxy pass. Otherwise they get lost. Normally buffer is empty before proxy pass, but this is not the case with pipeline mode of out npm driver; fixes connection hangup introduced by b80fe41af3e for it. 
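
For illustration, the general pattern (sketched here in Python with asyncio
streams, not the proxy's actual Rust types) is: before starting the plain
byte-for-byte relay, flush whatever the client has already pipelined into the
read buffer over to the compute side.

```python
import asyncio

async def proxy_pass(client_r: asyncio.StreamReader, client_w: asyncio.StreamWriter,
                     compute_r: asyncio.StreamReader, compute_w: asyncio.StreamWriter,
                     leftover: bytes) -> None:
    # Forward bytes the client pipelined during the handshake (startup,
    # password, first query) before entering the relay loop.
    if leftover:
        compute_w.write(leftover)
        await compute_w.drain()

    async def pump(src: asyncio.StreamReader, dst: asyncio.StreamWriter) -> None:
        while chunk := await src.read(8192):
            dst.write(chunk)
            await dst.drain()

    await asyncio.gather(pump(client_r, compute_w), pump(compute_r, client_w))
```
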
fixes https://github.com/neondatabase/neon/issues/3822 --- libs/pq_proto/src/framed.rs | 6 +++--- proxy/src/proxy.rs | 27 +++++++++++++++++++++------ proxy/src/stream.rs | 5 +++-- 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 972730cbab..3cdca45009 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -63,9 +63,9 @@ impl Framed { &self.stream } - /// Extract the underlying stream. - pub fn into_inner(self) -> S { - self.stream + /// Deconstruct into the underlying stream and read buffer. + pub fn into_inner(self) -> (S, BytesMut) { + (self.stream, self.read_buf) } /// Return new Framed with stream type transformed by async f, for TLS diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index abeff6a33b..efe0e8795b 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -16,7 +16,7 @@ use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCou use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::sync::Arc; -use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tracing::{error, info, warn}; use utils::measured_stream::MeasuredStream; @@ -209,9 +209,18 @@ async fn handshake( if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. - stream = PqStream::new( - stream.into_inner().upgrade(tls.to_server_config()).await?, - ); + + let (raw, read_buf) = stream.into_inner(); + // TODO: Normally, client doesn't send any data before + // server says TLS handshake is ok and read_buf is empy. + // However, you could imagine pipelining of postgres + // SSLRequest + TLS ClientHello in one hunk similar to + // pipelining in our node js driver. We should probably + // support that by chaining read_buf with the stream. + if !read_buf.is_empty() { + bail!("data is sent before server replied with EncryptionResponse"); + } + stream = PqStream::new(raw.upgrade(tls.to_server_config()).await?); } } _ => bail!(ERR_PROTO_VIOLATION), @@ -443,11 +452,17 @@ impl Client<'_, S> { value: mut node_info, } = auth_result; - let node = connect_to_compute(&mut node_info, params, &extra, &creds) + let mut node = connect_to_compute(&mut node_info, params, &extra, &creds) .or_else(|e| stream.throw_error(e)) .await?; prepare_client_connection(&node, reported_auth_ok, session, &mut stream).await?; - proxy_pass(stream.into_inner(), node.stream, &node_info.aux).await + // Before proxy passing, forward to compute whatever data is left in the + // PqStream input buffer. Normally there is none, but our serverless npm + // driver in pipeline mode sends startup, password and first query + // immediately after opening the connection. + let (stream, read_buf) = stream.into_inner(); + node.stream.write_all(&read_buf).await?; + proxy_pass(stream, node.stream, &node_info.aux).await } } diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index 9dfc435e39..7cb292ed58 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,5 +1,6 @@ use crate::error::UserFacingError; use anyhow::bail; +use bytes::BytesMut; use pin_project_lite::pin_project; use pq_proto::framed::{ConnectionError, Framed}; use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; @@ -27,8 +28,8 @@ impl PqStream { } } - /// Extract the underlying stream. - pub fn into_inner(self) -> S { + /// Extract the underlying stream and read buffer. 
+ pub fn into_inner(self) -> (S, BytesMut) { self.framed.into_inner() } From 4ed51ad33b12edfec2e03b47f8fcfb7e7b8173cb Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Tue, 11 Apr 2023 12:50:10 +0300 Subject: [PATCH 048/209] Add more proxy cnames --- .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml | 2 +- .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml | 2 +- .github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml | 2 +- .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml index 36dac8309d..5a98217bae 100644 --- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.ap-southeast-1.aws.neon.tech" - extraDomains: ["*.ap-southeast-1.retooldb.com"] + extraDomains: ["*.ap-southeast-1.retooldb.com", "*.ap-southeast-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml index f5b2f31cb9..a9ee49d82f 100644 --- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.eu-central-1.aws.neon.tech" - extraDomains: ["*.eu-central-1.retooldb.com"] + extraDomains: ["*.eu-central-1.retooldb.com", "*.eu-central-1.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml index 0be78d868a..239a9911c7 100644 --- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-east-2.aws.neon.tech" - extraDomains: ["*.us-east-2.retooldb.com"] + extraDomains: ["*.us-east-2.retooldb.com", "*.us-east-2.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml index 79115be0e2..c987ae236a 100644 --- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml +++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml @@ -24,7 +24,7 @@ settings: authBackend: "console" authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2" domain: "*.us-west-2.aws.neon.tech" - extraDomains: ["*.us-west-2.retooldb.com"] + extraDomains: ["*.us-west-2.retooldb.com", "*.us-west-2.postgres.vercel-storage.com"] sentryEnvironment: "production" wssPort: 8443 
metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events" From d11d781afc300b1717050cad38ff02e21aa2dc02 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 28 Apr 2023 17:20:18 +0300 Subject: [PATCH 049/209] revert: "Add check for duplicates of generated image layers" (#4104) This reverts commit 732acc5. Reverted PR: #3869 As noted in PR #4094, we do in fact try to insert duplicates to the layer map, if L0->L1 compaction is interrupted. We do not have a proper fix for that right now, and we are in a hurry to make a release to production, so revert the changes related to this to the state that we have in production currently. We know that we have a bug here, but better to live with the bug that we've had in production for a long time, than rush a fix to production without testing it in staging first. Cc: #4094, #4088 --- pageserver/benches/bench_layer_map.rs | 4 +-- pageserver/src/tenant.rs | 7 +----- pageserver/src/tenant/layer_map.rs | 23 +++++++---------- .../layer_map/historic_layer_coverage.rs | 8 ------ pageserver/src/tenant/timeline.rs | 25 +++++++------------ 5 files changed, 21 insertions(+), 46 deletions(-) diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 8f139a6596..ee5980212e 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap { min_lsn = min(min_lsn, lsn_range.start); max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1)); - updates.insert_historic(Arc::new(layer)).unwrap(); + updates.insert_historic(Arc::new(layer)); } println!("min: {min_lsn}, max: {max_lsn}"); @@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) { is_incremental: false, short_id: format!("Layer {}", i), }; - updates.insert_historic(Arc::new(layer)).unwrap(); + updates.insert_historic(Arc::new(layer)); } updates.flush(); println!("Finished layer map init in {:?}", now.elapsed()); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d69d5e4b45..5cfc466111 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -271,10 +271,7 @@ impl UninitializedTimeline<'_> { .await .context("Failed to flush after basebackup import")?; - // Initialize without loading the layer map. We started with an empty layer map, and already - // updated it for the layers that we created during the import. - let mut timelines = self.owning_tenant.timelines.lock().unwrap(); - self.initialize_with_lock(ctx, &mut timelines, false, true) + self.initialize(ctx) } fn raw_timeline(&self) -> anyhow::Result<&Arc> { @@ -2355,8 +2352,6 @@ impl Tenant { ) })?; - // Initialize the timeline without loading the layer map, because we already updated the layer - // map above, when we imported the datadir. let timeline = { let mut timelines = self.timelines.lock().unwrap(); raw_timeline.initialize_with_lock(ctx, &mut timelines, false, true)? diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 0ee0c6f77d..8d06ccd565 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -51,7 +51,7 @@ use crate::keyspace::KeyPartitioning; use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use crate::tenant::storage_layer::Layer; -use anyhow::{bail, Result}; +use anyhow::Result; use std::collections::VecDeque; use std::ops::Range; use std::sync::Arc; @@ -125,7 +125,7 @@ where /// /// Insert an on-disk layer. 
/// - pub fn insert_historic(&mut self, layer: Arc) -> anyhow::Result<()> { + pub fn insert_historic(&mut self, layer: Arc) { self.layer_map.insert_historic_noflush(layer) } @@ -273,21 +273,16 @@ where /// /// Helper function for BatchedUpdates::insert_historic /// - pub(self) fn insert_historic_noflush(&mut self, layer: Arc) -> anyhow::Result<()> { - let key = historic_layer_coverage::LayerKey::from(&*layer); - if self.historic.contains(&key) { - bail!( - "Attempt to insert duplicate layer {} in layer map", - layer.short_id() - ); - } - self.historic.insert(key, Arc::clone(&layer)); + pub(self) fn insert_historic_noflush(&mut self, layer: Arc) { + // TODO: See #3869, resulting #4088, attempted fix and repro #4094 + self.historic.insert( + historic_layer_coverage::LayerKey::from(&*layer), + Arc::clone(&layer), + ); if Self::is_l0(&layer) { self.l0_delta_layers.push(layer); } - - Ok(()) } /// @@ -839,7 +834,7 @@ mod tests { let expected_in_counts = (1, usize::from(expected_l0)); - map.batch_update().insert_historic(remote.clone()).unwrap(); + map.batch_update().insert_historic(remote.clone()); assert_eq!(count_layer_in(&map, &remote), expected_in_counts); let replaced = map diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs index 1fdcd5e5a4..b63c361314 100644 --- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs +++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs @@ -417,14 +417,6 @@ impl BufferedHistoricLayerCoverage { } } - pub fn contains(&self, layer_key: &LayerKey) -> bool { - match self.buffer.get(layer_key) { - Some(None) => false, // layer remove was buffered - Some(_) => true, // layer insert was buffered - None => self.layers.contains_key(layer_key), // no buffered ops for this layer - } - } - pub fn insert(&mut self, layer_key: LayerKey, value: Value) { self.buffer.insert(layer_key, Some(value)); } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 5c671ffd63..8768841d87 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1484,7 +1484,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer))?; + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) { // Create a DeltaLayer struct for each delta file. @@ -1516,7 +1516,7 @@ impl Timeline { trace!("found layer {}", layer.path().display()); total_physical_size += file_size; - updates.insert_historic(Arc::new(layer))?; + updates.insert_historic(Arc::new(layer)); num_layers += 1; } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") { // ignore these @@ -1590,7 +1590,7 @@ impl Timeline { // remote index file? // If so, rename_to_backup those files & replace their local layer with // a RemoteLayer in the layer map so that we re-download them on-demand. 
- if let Some(local_layer) = &local_layer { + if let Some(local_layer) = local_layer { let local_layer_path = local_layer .local_path() .expect("caller must ensure that local_layers only contains local layers"); @@ -1615,6 +1615,7 @@ impl Timeline { anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}"); } else { self.metrics.resident_physical_size_gauge.sub(local_size); + updates.remove_historic(local_layer); // fall-through to adding the remote layer } } else { @@ -1650,11 +1651,7 @@ impl Timeline { ); let remote_layer = Arc::new(remote_layer); - if let Some(local_layer) = &local_layer { - updates.replace_historic(local_layer, remote_layer)?; - } else { - updates.insert_historic(remote_layer)?; - } + updates.insert_historic(remote_layer); } LayerFileName::Delta(deltafilename) => { // Create a RemoteLayer for the delta file. @@ -1678,11 +1675,7 @@ impl Timeline { LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted), ); let remote_layer = Arc::new(remote_layer); - if let Some(local_layer) = &local_layer { - updates.replace_historic(local_layer, remote_layer)?; - } else { - updates.insert_historic(remote_layer)?; - } + updates.insert_historic(remote_layer); } } } @@ -2730,7 +2723,7 @@ impl Timeline { .write() .unwrap() .batch_update() - .insert_historic(Arc::new(new_delta))?; + .insert_historic(Arc::new(new_delta)); // update the timeline's physical size let sz = new_delta_path.metadata()?.len(); @@ -2935,7 +2928,7 @@ impl Timeline { self.metrics .resident_physical_size_gauge .add(metadata.len()); - updates.insert_historic(Arc::new(l))?; + updates.insert_historic(Arc::new(l)); } updates.flush(); drop(layers); @@ -3368,7 +3361,7 @@ impl Timeline { new_layer_paths.insert(new_delta_path, LayerFileMetadata::new(metadata.len())); let x: Arc = Arc::new(l); - updates.insert_historic(x)?; + updates.insert_historic(x); } // Now that we have reshuffled the data to set of new delta layers, we can From 4c3ba1627b26178917880f23c4c9eede98960626 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Fri, 28 Apr 2023 20:16:02 +0300 Subject: [PATCH 050/209] Add 4 new Pageservers for retool launch --- .github/ansible/prod.us-west-2.hosts.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 9cf847bcb1..1fde83520e 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ b/.github/ansible/prod.us-west-2.hosts.yaml @@ -41,6 +41,14 @@ storage: ansible_host: i-051642d372c0a4f32 pageserver-3.us-west-2.aws.neon.tech: ansible_host: i-00c3844beb9ad1c6b + pageserver-4.us-west-2.aws.neon.tech: + ansible_host: i-013263dd1c239adcc + pageserver-5.us-west-2.aws.neon.tech: + ansible_host: i-00ca6417c7bf96820 + pageserver-6.us-west-2.aws.neon.tech: + ansible_host: i-01cdf7d2bc1433b6a + pageserver-7.us-west-2.aws.neon.tech: + ansible_host: i-02eec9b40617db5bc safekeepers: hosts: @@ -50,4 +58,3 @@ storage: ansible_host: i-074682f9d3c712e7c safekeeper-2.us-west-2.aws.neon.tech: ansible_host: i-042b7efb1729d7966 - From ff1119da6678d589026d3a74004f2dfe53411abe Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 1 May 2023 13:33:10 +0300 Subject: [PATCH 051/209] Add 2 new sets of safekeepers to us-west2 --- .github/ansible/prod.us-west-2.hosts.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml index 1fde83520e..be65d8e63c 100644 --- a/.github/ansible/prod.us-west-2.hosts.yaml +++ 
b/.github/ansible/prod.us-west-2.hosts.yaml @@ -58,3 +58,15 @@ storage: ansible_host: i-074682f9d3c712e7c safekeeper-2.us-west-2.aws.neon.tech: ansible_host: i-042b7efb1729d7966 + safekeeper-3.us-west-2.aws.neon.tech: + ansible_host: i-089f6b9ef426dff76 + safekeeper-4.us-west-2.aws.neon.tech: + ansible_host: i-0fe6bf912c4710c82 + safekeeper-5.us-west-2.aws.neon.tech: + ansible_host: i-0a83c1c46d2b4e409 + safekeeper-6.us-west-2.aws.neon.tech: + ansible_host: i-0fef5317b8fdc9f8d + safekeeper-7.us-west-2.aws.neon.tech: + ansible_host: i-0be739190d4289bf9 + safekeeper-8.us-west-2.aws.neon.tech: + ansible_host: i-00e851803669e5cfe From 840183e51fde41b6f3019d33655d9974d2f6880e Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 11 May 2023 15:08:39 +0300 Subject: [PATCH 052/209] try: higher page_service timeouts to isolate an issue --- pageserver/src/page_service.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index a7a0d1a22e..bd3ece2dfc 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -256,7 +256,10 @@ async fn page_service_conn_main( // // no write timeout is used, because the kernel is assumed to error writes after some time. let mut socket = tokio_io_timeout::TimeoutReader::new(socket); - socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10))); + + // timeout should be lower, but trying out multiple days for + // + socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3))); let socket = std::pin::pin!(socket); // XXX: pgbackend.run() should take the connection_ctx, From 85d6194aa40671566f349283f63a692d1861c640 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 16 May 2023 17:19:12 +0100 Subject: [PATCH 053/209] Fix regress-tests job for Postgres 15 on release branch (#4254) ## Problem Compatibility tests don't support Postgres 15 yet, but we're still trying to upload compatibility snapshot (which we do not collect). 
Ref https://github.com/neondatabase/neon/actions/runs/4991394158/jobs/8940369368#step:4:38129 ## Summary of changes Add `pg_version` parameter to `run-python-test-set` actions and do not upload compatibility snapshot for Postgres 15 --- .github/actions/run-python-test-set/action.yml | 11 ++++++++--- .github/workflows/build_and_test.yml | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index d6c960bfda..bb120e9470 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -48,6 +48,10 @@ inputs: description: 'Whether to rerun flaky tests' required: false default: 'false' + pg_version: + description: 'Postgres version to use for tests' + required: false + default: 'v14' runs: using: "composite" @@ -68,7 +72,7 @@ runs: prefix: latest - name: Download compatibility snapshot for Postgres 14 - if: inputs.build_type != 'remote' + if: inputs.build_type != 'remote' && inputs.pg_version == 'v14' uses: ./.github/actions/download with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14 @@ -106,13 +110,14 @@ runs: ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') RERUN_FLAKY: ${{ inputs.rerun_flaky }} + PG_VERSION: ${{ inputs.pg_version }} shell: bash -euxo pipefail {0} run: | # PLATFORM will be embedded in the perf test report # and it is needed to distinguish different environments export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} - export DEFAULT_PG_VERSION=${DEFAULT_PG_VERSION:-14} + export DEFAULT_PG_VERSION=${PG_VERSION#v} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -193,7 +198,7 @@ runs: fi - name: Upload compatibility snapshot for Postgres 14 - if: github.ref_name == 'release' + if: github.ref_name == 'release' && inputs.pg_version == 'v14' uses: ./.github/actions/upload with: name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }} diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 5a09f0b4aa..ef9f171766 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -350,8 +350,8 @@ jobs: real_s3_access_key_id: "${{ secrets.AWS_ACCESS_KEY_ID_CI_TESTS_S3 }}" real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}" rerun_flaky: true + pg_version: ${{ matrix.pg_version }} env: - DEFAULT_PG_VERSION: ${{ matrix.pg_version }} TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }} CHECK_ONDISK_DATA_COMPATIBILITY: nonempty From b0a77844f60cdcfb84648f2a7d40f0bb85c1f189 Mon Sep 17 00:00:00 2001 From: Stas Kelvich Date: Fri, 14 Apr 2023 19:41:02 +0300 Subject: [PATCH 054/209] Add SQL-over-HTTP endpoint to Proxy This commit introduces an SQL-over-HTTP endpoint in the proxy, with a JSON response structure resembling that of the node-postgres driver. This method, using HTTP POST, achieves smaller amortized latencies in edge setups due to fewer round trips and an enhanced open connection reuse by the v8 engine. This update involves several intricacies: 1. 
SQL injection protection: We employed the extended query protocol, modifying the rust-postgres driver to send queries in one roundtrip using a text protocol rather than binary, bypassing potential issues like those identified in https://github.com/sfackler/rust-postgres/issues/1030. 2. Postgres type compatibility: As not all postgres types have binary representations (e.g., acl's in pg_class), we adjusted rust-postgres to respond with text protocol, simplifying serialization and fixing queries with text-only types in response. 3. Data type conversion: Considering JSON supports fewer data types than Postgres, we perform conversions where possible, passing all other types as strings. Key conversions include: - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain text) - postgres bool, null, text -> json bool, null, string - postgres array -> json array - postgres json and jsonb -> json object 4. Alignment with node-postgres: To facilitate integration with js libraries, we've matched the response structure of node-postgres, returning command tags and column oids. Command tag capturing was added to the rust-postgres functionality as part of this change. --- Cargo.lock | 10 +- Cargo.toml | 12 +- proxy/README.md | 86 ++- proxy/src/config.rs | 5 +- proxy/src/http.rs | 1 + proxy/src/http/sql_over_http.rs | 603 ++++++++++++++++++ proxy/src/http/websocket.rs | 86 ++- test_runner/fixtures/neon_fixtures.py | 41 +- test_runner/regress/test_metric_collection.py | 2 + test_runner/regress/test_proxy.py | 128 +++- 10 files changed, 909 insertions(+), 65 deletions(-) create mode 100644 proxy/src/http/sql_over_http.rs diff --git a/Cargo.lock b/Cargo.lock index 55418473d5..4d63ebd99d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2820,7 +2820,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "bytes", "fallible-iterator", @@ -2833,7 +2833,7 @@ dependencies = [ [[package]] name = "postgres-native-tls" version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "native-tls", "tokio", @@ -2844,7 +2844,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "base64 0.20.0", "byteorder", @@ -2862,7 +2862,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "bytes", "fallible-iterator", @@ -4321,7 +4321,7 @@ 
dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=0bc41d8503c092b040142214aac3cf7d11d0c19f#0bc41d8503c092b040142214aac3cf7d11d0c19f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index c901532f86..7895459841 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -126,11 +126,11 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" } ## Other git libraries @@ -166,7 +166,7 @@ tonic-build = "0.9" # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="0bc41d8503c092b040142214aac3cf7d11d0c19f" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" } # Changes the MAX_THREADS limit from 4096 to 32768. # This is a temporary workaround for using tracing from many threads in safekeepers code, diff --git a/proxy/README.md b/proxy/README.md index 4ead098b73..cd76a2443f 100644 --- a/proxy/README.md +++ b/proxy/README.md @@ -1,6 +1,6 @@ # Proxy -Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. Following backends are currently implemented: +Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme and cluster routing method. 
Following routing backends are currently implemented: * console new SCRAM-based console API; uses SNI info to select the destination project (endpoint soon) @@ -9,6 +9,90 @@ Proxy binary accepts `--auth-backend` CLI option, which determines auth scheme a * link sends login link for all usernames +Also proxy can expose following services to the external world: + +* postgres protocol over TCP -- usual postgres endpoint compatible with usual + postgres drivers +* postgres protocol over WebSockets -- same protocol tunneled over websockets + for environments where TCP connection is not available. We have our own + implementation of a client that uses node-postgres and tunnels traffic through + websockets: https://github.com/neondatabase/serverless +* SQL over HTTP -- service that accepts POST requests with SQL text over HTTP + and responds with JSON-serialised results. + + +## SQL over HTTP + +Contrary to the usual postgres proto over TCP and WebSockets using plain +one-shot HTTP request achieves smaller amortized latencies in edge setups due to +fewer round trips and an enhanced open connection reuse by the v8 engine. Also +such endpoint could be used directly without any driver. + +To play with it locally one may start proxy over a local postgres installation +(see end of this page on how to generate certs with openssl): + +``` +./target/debug/proxy -c server.crt -k server.key --auth-backend=postgres --auth-endpoint=postgres://stas@127.0.0.1:5432/stas --wss 0.0.0.0:4444 +``` + +If both postgres and proxy are running you may send a SQL query: +```json +curl -k -X POST 'https://proxy.localtest.me:4444/sql' \ + -H 'Neon-Connection-String: postgres://stas:pass@proxy.localtest.me:4444/postgres' \ + -H 'Content-Type: application/json' \ + --data '{ + "query":"SELECT $1::int[] as arr, $2::jsonb as obj, 42 as num", + "params":[ "{{1,2},{\"3\",4}}", {"key":"val", "ikey":4242}] + }' | jq + +{ + "command": "SELECT", + "fields": [ + { "dataTypeID": 1007, "name": "arr" }, + { "dataTypeID": 3802, "name": "obj" }, + { "dataTypeID": 23, "name": "num" } + ], + "rowCount": 1, + "rows": [ + { + "arr": [[1,2],[3,4]], + "num": 42, + "obj": { + "ikey": 4242, + "key": "val" + } + } + ] +} +``` + + +With the current approach we made the following design decisions: + +1. SQL injection protection: We employed the extended query protocol, modifying + the rust-postgres driver to send queries in one roundtrip using a text + protocol rather than binary, bypassing potential issues like those identified + in sfackler/rust-postgres#1030. + +2. Postgres type compatibility: As not all postgres types have binary + representations (e.g., acl's in pg_class), we adjusted rust-postgres to + respond with text protocol, simplifying serialization and fixing queries with + text-only types in response. + +3. Data type conversion: Considering JSON supports fewer data types than + Postgres, we perform conversions where possible, passing all other types as + strings. Key conversions include: + - postgres int2, int4, float4, float8 -> json number (NaN and Inf remain + text) + - postgres bool, null, text -> json bool, null, string + - postgres array -> json array + - postgres json and jsonb -> json object + +4. Alignment with node-postgres: To facilitate integration with js libraries, + we've matched the response structure of node-postgres, returning command tags + and column oids. Command tag capturing was added to the rust-postgres + functionality as part of this change. 
+ ## Using SNI-based routing on localhost Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy: diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 530229b3fd..6a26cea78e 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -100,9 +100,10 @@ impl CertResolver { is_default: bool, ) -> anyhow::Result<()> { let priv_key = { - let key_bytes = std::fs::read(key_path).context("TLS key file")?; - let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + let key_bytes = std::fs::read(key_path) .context(format!("Failed to read TLS keys at '{key_path}'"))?; + let mut keys = rustls_pemfile::pkcs8_private_keys(&mut &key_bytes[..]) + .context(format!("Failed to parse TLS keys at '{key_path}'"))?; ensure!(keys.len() == 1, "keys.len() = {} (should be 1)", keys.len()); keys.pop().map(rustls::PrivateKey).unwrap() diff --git a/proxy/src/http.rs b/proxy/src/http.rs index a544157800..5cf49b669c 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -3,6 +3,7 @@ //! directly relying on deps like `reqwest` (think loose coupling). pub mod server; +pub mod sql_over_http; pub mod websocket; pub use reqwest::{Request, Response, StatusCode}; diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs new file mode 100644 index 0000000000..0438a82c12 --- /dev/null +++ b/proxy/src/http/sql_over_http.rs @@ -0,0 +1,603 @@ +use futures::pin_mut; +use futures::StreamExt; +use hyper::body::HttpBody; +use hyper::{Body, HeaderMap, Request}; +use pq_proto::StartupMessageParams; +use serde_json::json; +use serde_json::Map; +use serde_json::Value; +use tokio_postgres::types::Kind; +use tokio_postgres::types::Type; +use tokio_postgres::Row; +use url::Url; + +use crate::{auth, config::ProxyConfig, console}; + +#[derive(serde::Deserialize)] +struct QueryData { + query: String, + params: Vec, +} + +const APP_NAME: &str = "sql_over_http"; +const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB +const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB + +// +// Convert json non-string types to strings, so that they can be passed to Postgres +// as parameters. +// +fn json_to_pg_text(json: Vec) -> Result, serde_json::Error> { + json.iter() + .map(|value| { + match value { + Value::Null => serde_json::to_string(value), + Value::Bool(_) => serde_json::to_string(value), + Value::Number(_) => serde_json::to_string(value), + Value::Object(_) => serde_json::to_string(value), + + // no need to escape + Value::String(s) => Ok(s.to_string()), + + // special care for arrays + Value::Array(_) => json_array_to_pg_array(value), + } + }) + .collect() +} + +// +// Serialize a JSON array to a Postgres array. Contrary to the strings in the params +// in the array we need to escape the strings. Postgres is okay with arrays of form +// '{1,"2",3}'::int[], so we don't check that array holds values of the same type, leaving +// it for Postgres to check. 
+// +// Example of the same escaping in node-postgres: packages/pg/lib/utils.js +// +fn json_array_to_pg_array(value: &Value) -> Result { + match value { + // same + Value::Null => serde_json::to_string(value), + Value::Bool(_) => serde_json::to_string(value), + Value::Number(_) => serde_json::to_string(value), + Value::Object(_) => serde_json::to_string(value), + + // now needs to be escaped, as it is part of the array + Value::String(_) => serde_json::to_string(value), + + // recurse into array + Value::Array(arr) => { + let vals = arr + .iter() + .map(json_array_to_pg_array) + .collect::, _>>()? + .join(","); + Ok(format!("{{{}}}", vals)) + } + } +} + +fn get_conn_info( + headers: &HeaderMap, + sni_hostname: Option, +) -> Result<(String, String, String, String), anyhow::Error> { + let connection_string = headers + .get("Neon-Connection-String") + .ok_or(anyhow::anyhow!("missing connection string"))? + .to_str()?; + + let connection_url = Url::parse(connection_string)?; + + let protocol = connection_url.scheme(); + if protocol != "postgres" && protocol != "postgresql" { + return Err(anyhow::anyhow!( + "connection string must start with postgres: or postgresql:" + )); + } + + let mut url_path = connection_url + .path_segments() + .ok_or(anyhow::anyhow!("missing database name"))?; + + let dbname = url_path + .next() + .ok_or(anyhow::anyhow!("invalid database name"))?; + + let username = connection_url.username(); + if username.is_empty() { + return Err(anyhow::anyhow!("missing username")); + } + + let password = connection_url + .password() + .ok_or(anyhow::anyhow!("no password"))?; + + // TLS certificate selector now based on SNI hostname, so if we are running here + // we are sure that SNI hostname is set to one of the configured domain names. + let sni_hostname = sni_hostname.ok_or(anyhow::anyhow!("no SNI hostname set"))?; + + let hostname = connection_url + .host_str() + .ok_or(anyhow::anyhow!("no host"))?; + + let host_header = headers + .get("host") + .and_then(|h| h.to_str().ok()) + .and_then(|h| h.split(':').next()); + + if hostname != sni_hostname { + return Err(anyhow::anyhow!("mismatched SNI hostname and hostname")); + } else if let Some(h) = host_header { + if h != hostname { + return Err(anyhow::anyhow!("mismatched host header and hostname")); + } + } + + Ok(( + username.to_owned(), + dbname.to_owned(), + hostname.to_owned(), + password.to_owned(), + )) +} + +// TODO: return different http error codes +pub async fn handle( + config: &'static ProxyConfig, + request: Request, + sni_hostname: Option, +) -> anyhow::Result { + // + // Determine the destination and connection params + // + let headers = request.headers(); + let (username, dbname, hostname, password) = get_conn_info(headers, sni_hostname)?; + let credential_params = StartupMessageParams::new([ + ("user", &username), + ("database", &dbname), + ("application_name", APP_NAME), + ]); + + // + // Wake up the destination if needed. Code here is a bit involved because + // we reuse the code from the usual proxy and we need to prepare few structures + // that this code expects. 
+ // + let tls = config.tls_config.as_ref(); + let common_names = tls.and_then(|tls| tls.common_names.clone()); + let creds = config + .auth_backend + .as_ref() + .map(|_| auth::ClientCredentials::parse(&credential_params, Some(&hostname), common_names)) + .transpose()?; + let extra = console::ConsoleReqExtra { + session_id: uuid::Uuid::new_v4(), + application_name: Some(APP_NAME), + }; + let node = creds.wake_compute(&extra).await?.expect("msg"); + let conf = node.value.config; + let port = *conf.get_ports().first().expect("no port"); + let host = match conf.get_hosts().first().expect("no host") { + tokio_postgres::config::Host::Tcp(host) => host, + tokio_postgres::config::Host::Unix(_) => { + return Err(anyhow::anyhow!("unix socket is not supported")); + } + }; + + let request_content_length = match request.body().size_hint().upper() { + Some(v) => v, + None => MAX_REQUEST_SIZE + 1, + }; + + if request_content_length > MAX_REQUEST_SIZE { + return Err(anyhow::anyhow!( + "request is too large (max {MAX_REQUEST_SIZE} bytes)" + )); + } + + // + // Read the query and query params from the request body + // + let body = hyper::body::to_bytes(request.into_body()).await?; + let QueryData { query, params } = serde_json::from_slice(&body)?; + let query_params = json_to_pg_text(params)?; + + // + // Connenct to the destination + // + let (client, connection) = tokio_postgres::Config::new() + .host(host) + .port(port) + .user(&username) + .password(&password) + .dbname(&dbname) + .max_backend_message_size(MAX_RESPONSE_SIZE) + .connect(tokio_postgres::NoTls) + .await?; + + tokio::spawn(async move { + if let Err(e) = connection.await { + eprintln!("connection error: {}", e); + } + }); + + // + // Now execute the query and return the result + // + let row_stream = client.query_raw_txt(query, query_params).await?; + + // Manually drain the stream into a vector to leave row_stream hanging + // around to get a command tag. Also check that the response is not too + // big. 
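+    // Note: the row stream returned above has to be pinned before it can be
+    // driven with StreamExt::next, hence the pin_mut! below.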
+ pin_mut!(row_stream); + let mut rows: Vec = Vec::new(); + let mut curret_size = 0; + while let Some(row) = row_stream.next().await { + let row = row?; + curret_size += row.body_len(); + rows.push(row); + if curret_size > MAX_RESPONSE_SIZE { + return Err(anyhow::anyhow!("response too large")); + } + } + + // grab the command tag and number of rows affected + let command_tag = row_stream.command_tag().unwrap_or_default(); + let mut command_tag_split = command_tag.split(' '); + let command_tag_name = command_tag_split.next().unwrap_or_default(); + let command_tag_count = if command_tag_name == "INSERT" { + // INSERT returns OID first and then number of rows + command_tag_split.nth(1) + } else { + // other commands return number of rows (if any) + command_tag_split.next() + } + .and_then(|s| s.parse::().ok()); + + let fields = if !rows.is_empty() { + rows[0] + .columns() + .iter() + .map(|c| { + json!({ + "name": Value::String(c.name().to_owned()), + "dataTypeID": Value::Number(c.type_().oid().into()), + }) + }) + .collect::>() + } else { + Vec::new() + }; + + // convert rows to JSON + let rows = rows + .iter() + .map(pg_text_row_to_json) + .collect::, _>>()?; + + // resulting JSON format is based on the format of node-postgres result + Ok(json!({ + "command": command_tag_name, + "rowCount": command_tag_count, + "rows": rows, + "fields": fields, + })) +} + +// +// Convert postgres row with text-encoded values to JSON object +// +pub fn pg_text_row_to_json(row: &Row) -> Result { + let res = row + .columns() + .iter() + .enumerate() + .map(|(i, column)| { + let name = column.name(); + let pg_value = row.as_text(i)?; + let json_value = pg_text_to_json(pg_value, column.type_())?; + Ok((name.to_string(), json_value)) + }) + .collect::, anyhow::Error>>()?; + + Ok(Value::Object(res)) +} + +// +// Convert postgres text-encoded value to JSON value +// +pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result { + if let Some(val) = pg_value { + if val == "NULL" { + return Ok(Value::Null); + } + + if let Kind::Array(elem_type) = pg_type.kind() { + return pg_array_parse(val, elem_type); + } + + match *pg_type { + Type::BOOL => Ok(Value::Bool(val == "t")), + Type::INT2 | Type::INT4 => { + let val = val.parse::()?; + Ok(Value::Number(serde_json::Number::from(val))) + } + Type::FLOAT4 | Type::FLOAT8 => { + let fval = val.parse::()?; + let num = serde_json::Number::from_f64(fval); + if let Some(num) = num { + Ok(Value::Number(num)) + } else { + // Pass Nan, Inf, -Inf as strings + // JS JSON.stringify() does converts them to null, but we + // want to preserve them, so we pass them as strings + Ok(Value::String(val.to_string())) + } + } + Type::JSON | Type::JSONB => Ok(serde_json::from_str(val)?), + _ => Ok(Value::String(val.to_string())), + } + } else { + Ok(Value::Null) + } +} + +// +// Parse postgres array into JSON array. +// +// This is a bit involved because we need to handle nested arrays and quoted +// values. Unlike postgres we don't check that all nested arrays have the same +// dimensions, we just return them as is. 
+// +fn pg_array_parse(pg_array: &str, elem_type: &Type) -> Result { + _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) +} + +fn _pg_array_parse( + pg_array: &str, + elem_type: &Type, + nested: bool, +) -> Result<(Value, usize), anyhow::Error> { + let mut pg_array_chr = pg_array.char_indices(); + let mut level = 0; + let mut quote = false; + let mut entries: Vec = Vec::new(); + let mut entry = String::new(); + + // skip bounds decoration + if let Some('[') = pg_array.chars().next() { + for (_, c) in pg_array_chr.by_ref() { + if c == '=' { + break; + } + } + } + + while let Some((mut i, mut c)) = pg_array_chr.next() { + let mut escaped = false; + + if c == '\\' { + escaped = true; + (i, c) = pg_array_chr.next().unwrap(); + } + + match c { + '{' if !quote => { + level += 1; + if level > 1 { + let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; + entries.push(res); + for _ in 0..off - 1 { + pg_array_chr.next(); + } + } + } + '}' => { + level -= 1; + if level == 0 { + if !entry.is_empty() { + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + } + if nested { + return Ok((Value::Array(entries), i)); + } + } + } + '"' if !escaped => { + if quote { + // push even if empty + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + entry = String::new(); + } + quote = !quote; + } + ',' if !quote => { + if !entry.is_empty() { + entries.push(pg_text_to_json(Some(&entry), elem_type)?); + entry = String::new(); + } + } + _ => { + entry.push(c); + } + } + } + + if level != 0 { + return Err(anyhow::anyhow!("unbalanced array")); + } + + Ok((Value::Array(entries), 0)) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_atomic_types_to_pg_params() { + let json = vec![Value::Bool(true), Value::Bool(false)]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["true", "false"]); + + let json = vec![Value::Number(serde_json::Number::from(42))]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["42"]); + + let json = vec![Value::String("foo\"".to_string())]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["foo\""]); + + let json = vec![Value::Null]; + let pg_params = json_to_pg_text(json).unwrap(); + assert_eq!(pg_params, vec!["null"]); + } + + #[test] + fn test_json_array_to_pg_array() { + // atoms and escaping + let json = "[true, false, null, 42, \"foo\", \"bar\\\"-\\\\\"]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]).unwrap(); + assert_eq!( + pg_params, + vec!["{true,false,null,42,\"foo\",\"bar\\\"-\\\\\"}"] + ); + + // nested arrays + let json = "[[true, false], [null, 42], [\"foo\", \"bar\\\"-\\\\\"]]"; + let json: Value = serde_json::from_str(json).unwrap(); + let pg_params = json_to_pg_text(vec![json]).unwrap(); + assert_eq!( + pg_params, + vec!["{{true,false},{null,42},{\"foo\",\"bar\\\"-\\\\\"}}"] + ); + } + + #[test] + fn test_atomic_types_parse() { + assert_eq!( + pg_text_to_json(Some("foo"), &Type::TEXT).unwrap(), + json!("foo") + ); + assert_eq!(pg_text_to_json(None, &Type::TEXT).unwrap(), json!(null)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT4).unwrap(), json!(42)); + assert_eq!(pg_text_to_json(Some("42"), &Type::INT2).unwrap(), json!(42)); + assert_eq!( + pg_text_to_json(Some("42"), &Type::INT8).unwrap(), + json!("42") + ); + assert_eq!( + pg_text_to_json(Some("42.42"), &Type::FLOAT8).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("42.42"), 
&Type::FLOAT4).unwrap(), + json!(42.42) + ); + assert_eq!( + pg_text_to_json(Some("NaN"), &Type::FLOAT4).unwrap(), + json!("NaN") + ); + assert_eq!( + pg_text_to_json(Some("Infinity"), &Type::FLOAT4).unwrap(), + json!("Infinity") + ); + assert_eq!( + pg_text_to_json(Some("-Infinity"), &Type::FLOAT4).unwrap(), + json!("-Infinity") + ); + + let json: Value = + serde_json::from_str("{\"s\":\"str\",\"n\":42,\"f\":4.2,\"a\":[null,3,\"a\"]}") + .unwrap(); + assert_eq!( + pg_text_to_json( + Some(r#"{"s":"str","n":42,"f":4.2,"a":[null,3,"a"]}"#), + &Type::JSONB + ) + .unwrap(), + json + ); + } + + #[test] + fn test_pg_array_parse_text() { + fn pt(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::TEXT).unwrap() + } + assert_eq!( + pt(r#"{"aa\"\\\,a",cha,"bbbb"}"#), + json!(["aa\"\\,a", "cha", "bbbb"]) + ); + assert_eq!( + pt(r#"{{"foo","bar"},{"bee","bop"}}"#), + json!([["foo", "bar"], ["bee", "bop"]]) + ); + assert_eq!( + pt(r#"{{{{"foo",NULL,"bop",bup}}}}"#), + json!([[[["foo", null, "bop", "bup"]]]]) + ); + assert_eq!( + pt(r#"{{"1",2,3},{4,NULL,6},{NULL,NULL,NULL}}"#), + json!([["1", "2", "3"], ["4", null, "6"], [null, null, null]]) + ); + } + + #[test] + fn test_pg_array_parse_bool() { + fn pb(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::BOOL).unwrap() + } + assert_eq!(pb(r#"{t,f,t}"#), json!([true, false, true])); + assert_eq!(pb(r#"{{t,f,t}}"#), json!([[true, false, true]])); + assert_eq!( + pb(r#"{{t,f},{f,t}}"#), + json!([[true, false], [false, true]]) + ); + assert_eq!( + pb(r#"{{t,NULL},{NULL,f}}"#), + json!([[true, null], [null, false]]) + ); + } + + #[test] + fn test_pg_array_parse_numbers() { + fn pn(pg_arr: &str, ty: &Type) -> Value { + pg_array_parse(pg_arr, ty).unwrap() + } + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT4), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT2), json!([1, 2, 3])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::INT8), json!(["1", "2", "3"])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT4), json!([1.0, 2.0, 3.0])); + assert_eq!(pn(r#"{1,2,3}"#, &Type::FLOAT8), json!([1.0, 2.0, 3.0])); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT4), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{1.1,2.2,3.3}"#, &Type::FLOAT8), + json!([1.1, 2.2, 3.3]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT4), + json!(["NaN", "Infinity", "-Infinity"]) + ); + assert_eq!( + pn(r#"{NaN,Infinity,-Infinity}"#, &Type::FLOAT8), + json!(["NaN", "Infinity", "-Infinity"]) + ); + } + + #[test] + fn test_pg_array_with_decoration() { + fn p(pg_arr: &str) -> Value { + pg_array_parse(pg_arr, &Type::INT2).unwrap() + } + assert_eq!( + p(r#"[1:1][-2:-1][3:5]={{{1,2,3},{4,5,6}}}"#), + json!([[[1, 2, 3], [4, 5, 6]]]) + ); + } +} diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index c7676e8e14..fbb602e3d2 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -4,12 +4,17 @@ use crate::{ use bytes::{Buf, Bytes}; use futures::{Sink, Stream, StreamExt}; use hyper::{ - server::{accept, conn::AddrIncoming}, + server::{ + accept, + conn::{AddrIncoming, AddrStream}, + }, upgrade::Upgraded, - Body, Request, Response, StatusCode, + Body, Method, Request, Response, StatusCode, }; use hyper_tungstenite::{tungstenite::Message, HyperWebsocket, WebSocketStream}; use pin_project_lite::pin_project; +use serde_json::{json, Value}; + use std::{ convert::Infallible, future::ready, @@ -21,6 +26,7 @@ use tls_listener::TlsListener; use tokio::{ io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}, net::TcpListener, + select, }; 
use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; @@ -30,6 +36,8 @@ use utils::http::{error::ApiError, json::json_response}; // Tracking issue: https://github.com/rust-lang/rust/issues/98407. use sync_wrapper::SyncWrapper; +use super::sql_over_http; + pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. @@ -159,6 +167,7 @@ async fn ws_handler( config: &'static ProxyConfig, cancel_map: Arc, session_id: uuid::Uuid, + sni_hostname: Option, ) -> Result, ApiError> { let host = request .headers() @@ -181,8 +190,44 @@ async fn ws_handler( // Return the response so the spawned future can continue. Ok(response) + // TODO: that deserves a refactor as now this function also handles http json client besides websockets. + // Right now I don't want to blow up sql-over-http patch with file renames and do that as a follow up instead. + } else if request.uri().path() == "/sql" && request.method() == Method::POST { + let result = select! { + _ = tokio::time::sleep(std::time::Duration::from_secs(10)) => { + Err(anyhow::anyhow!("Query timed out")) + } + response = sql_over_http::handle(config, request, sni_hostname) => { + response + } + }; + let status_code = match result { + Ok(_) => StatusCode::OK, + Err(_) => StatusCode::BAD_REQUEST, + }; + let json = match result { + Ok(r) => r, + Err(e) => { + let message = format!("{:?}", e); + let code = match e.downcast_ref::() { + Some(e) => match e.code() { + Some(e) => serde_json::to_value(e.code()).unwrap(), + None => Value::Null, + }, + None => Value::Null, + }; + json!({ "message": message, "code": code }) + } + }; + json_response(status_code, json).map(|mut r| { + r.headers_mut().insert( + "Access-Control-Allow-Origin", + hyper::http::HeaderValue::from_static("*"), + ); + r + }) } else { - json_response(StatusCode::OK, "Connect with a websocket client") + json_response(StatusCode::BAD_REQUEST, "query is not supported") } } @@ -216,20 +261,27 @@ pub async fn task_main( } }); - let make_svc = hyper::service::make_service_fn(|_stream| async move { - Ok::<_, Infallible>(hyper::service::service_fn( - move |req: Request| async move { - let cancel_map = Arc::new(CancelMap::default()); - let session_id = uuid::Uuid::new_v4(); - ws_handler(req, config, cancel_map, session_id) - .instrument(info_span!( - "ws-client", - session = format_args!("{session_id}") - )) - .await - }, - )) - }); + let make_svc = + hyper::service::make_service_fn(|stream: &tokio_rustls::server::TlsStream| { + let sni_name = stream.get_ref().1.sni_hostname().map(|s| s.to_string()); + + async move { + Ok::<_, Infallible>(hyper::service::service_fn(move |req: Request| { + let sni_name = sni_name.clone(); + async move { + let cancel_map = Arc::new(CancelMap::default()); + let session_id = uuid::Uuid::new_v4(); + + ws_handler(req, config, cancel_map, session_id, sni_name) + .instrument(info_span!( + "ws-client", + session = format_args!("{session_id}") + )) + .await + } + })) + } + }); hyper::Server::builder(accept::from_stream(tls_listener)) .serve(make_svc) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8ec17834ac..bde91e6783 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2042,15 +2042,19 @@ class NeonProxy(PgProtocol): proxy_port: int, http_port: int, mgmt_port: int, + external_http_port: int, auth_backend: NeonProxy.AuthBackend, metric_collection_endpoint: Optional[str] = None, 
metric_collection_interval: Optional[str] = None, ): host = "127.0.0.1" - super().__init__(dsn=auth_backend.default_conn_url, host=host, port=proxy_port) + domain = "proxy.localtest.me" # resolves to 127.0.0.1 + super().__init__(dsn=auth_backend.default_conn_url, host=domain, port=proxy_port) + self.domain = domain self.host = host self.http_port = http_port + self.external_http_port = external_http_port self.neon_binpath = neon_binpath self.test_output_dir = test_output_dir self.proxy_port = proxy_port @@ -2062,11 +2066,42 @@ class NeonProxy(PgProtocol): def start(self) -> NeonProxy: assert self._popen is None + + # generate key of it doesn't exist + crt_path = self.test_output_dir / "proxy.crt" + key_path = self.test_output_dir / "proxy.key" + + if not key_path.exists(): + r = subprocess.run( + [ + "openssl", + "req", + "-new", + "-x509", + "-days", + "365", + "-nodes", + "-text", + "-out", + str(crt_path), + "-keyout", + str(key_path), + "-subj", + "/CN=*.localtest.me", + "-addext", + "subjectAltName = DNS:*.localtest.me", + ] + ) + assert r.returncode == 0 + args = [ str(self.neon_binpath / "proxy"), *["--http", f"{self.host}:{self.http_port}"], *["--proxy", f"{self.host}:{self.proxy_port}"], *["--mgmt", f"{self.host}:{self.mgmt_port}"], + *["--wss", f"{self.host}:{self.external_http_port}"], + *["-c", str(crt_path)], + *["-k", str(key_path)], *self.auth_backend.extra_args(), ] @@ -2190,6 +2225,7 @@ def link_proxy( http_port = port_distributor.get_port() proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -2197,6 +2233,7 @@ def link_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + external_http_port=external_http_port, auth_backend=NeonProxy.Link(), ) as proxy: proxy.start() @@ -2224,6 +2261,7 @@ def static_proxy( proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() with NeonProxy( neon_binpath=neon_binpath, @@ -2231,6 +2269,7 @@ def static_proxy( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + external_http_port=external_http_port, auth_backend=NeonProxy.Postgres(auth_endpoint), ) as proxy: proxy.start() diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 1231188896..00ea77f2e7 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -204,6 +204,7 @@ def proxy_with_metric_collector( http_port = port_distributor.get_port() proxy_port = port_distributor.get_port() mgmt_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() (host, port) = httpserver_listen_address metric_collection_endpoint = f"http://{host}:{port}/billing/api/v1/usage_events" @@ -215,6 +216,7 @@ def proxy_with_metric_collector( proxy_port=proxy_port, http_port=http_port, mgmt_port=mgmt_port, + external_http_port=external_http_port, metric_collection_endpoint=metric_collection_endpoint, metric_collection_interval=metric_collection_interval, auth_backend=NeonProxy.Link(), diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index ae914e384e..6be3995714 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -1,22 +1,32 @@ +import json import subprocess +from typing import Any, List import psycopg2 import pytest +import requests 
from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_proxy_select_1(static_proxy: NeonProxy, option_name: str): +def test_proxy_select_1(static_proxy: NeonProxy): """ A simplest smoke test: check proxy against a local postgres instance. """ - out = static_proxy.safe_psql("select 1", options=f"{option_name}=generic-project-name") + # no SNI, deprecated `options=project` syntax (before we had several endpoint in project) + out = static_proxy.safe_psql("select 1", sslsni=0, options="project=generic-project-name") assert out[0][0] == 1 + # no SNI, new `options=endpoint` syntax + out = static_proxy.safe_psql("select 1", sslsni=0, options="endpoint=generic-project-name") + assert out[0][0] == 1 -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_password_hack(static_proxy: NeonProxy, option_name: str): + # with SNI + out = static_proxy.safe_psql("select 42", host="generic-project-name.localtest.me") + assert out[0][0] == 42 + + +def test_password_hack(static_proxy: NeonProxy): """ Check the PasswordHack auth flow: an alternative to SCRAM auth for clients which can't provide the project/endpoint name via SNI or `options`. @@ -24,14 +34,16 @@ def test_password_hack(static_proxy: NeonProxy, option_name: str): user = "borat" password = "password" - static_proxy.safe_psql( - f"create role {user} with login password '{password}'", - options=f"{option_name}=irrelevant", - ) + static_proxy.safe_psql(f"create role {user} with login password '{password}'") # Note the format of `magic`! - magic = f"{option_name}=irrelevant;{password}" - static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + magic = f"project=irrelevant;{password}" + out = static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + assert out[0][0] == 1 + + magic = f"endpoint=irrelevant;{password}" + out = static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic) + assert out[0][0] == 1 # Must also check that invalid magic won't be accepted. with pytest.raises(psycopg2.OperationalError): @@ -69,52 +81,55 @@ def test_proxy_options(static_proxy: NeonProxy, option_name: str): """ options = f"{option_name}=irrelevant -cproxytest.option=value" - out = static_proxy.safe_psql("show proxytest.option", options=options) + out = static_proxy.safe_psql("show proxytest.option", options=options, sslsni=0) assert out[0][0] == "value" options = f"-c proxytest.foo=\\ str {option_name}=irrelevant" + out = static_proxy.safe_psql("show proxytest.foo", options=options, sslsni=0) + assert out[0][0] == " str" + + options = "-cproxytest.option=value" + out = static_proxy.safe_psql("show proxytest.option", options=options) + assert out[0][0] == "value" + + options = "-c proxytest.foo=\\ str" out = static_proxy.safe_psql("show proxytest.foo", options=options) assert out[0][0] == " str" -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_auth_errors(static_proxy: NeonProxy, option_name: str): +def test_auth_errors(static_proxy: NeonProxy): """ Check that we throw very specific errors in some unsuccessful auth scenarios. 
""" # User does not exist with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", options=f"{option_name}=irrelevant") + static_proxy.connect(user="pinocchio") text = str(exprinfo.value).strip() - assert text.endswith("password authentication failed for user 'pinocchio'") + assert text.find("password authentication failed for user 'pinocchio'") != -1 static_proxy.safe_psql( "create role pinocchio with login password 'magic'", - options=f"{option_name}=irrelevant", ) # User exists, but password is missing with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", password=None, options=f"{option_name}=irrelevant") + static_proxy.connect(user="pinocchio", password=None) text = str(exprinfo.value).strip() - assert text.endswith("password authentication failed for user 'pinocchio'") + assert text.find("password authentication failed for user 'pinocchio'") != -1 # User exists, but password is wrong with pytest.raises(psycopg2.Error) as exprinfo: - static_proxy.connect(user="pinocchio", password="bad", options=f"{option_name}=irrelevant") + static_proxy.connect(user="pinocchio", password="bad") text = str(exprinfo.value).strip() - assert text.endswith("password authentication failed for user 'pinocchio'") + assert text.find("password authentication failed for user 'pinocchio'") != -1 # Finally, check that the user can connect - with static_proxy.connect( - user="pinocchio", password="magic", options=f"{option_name}=irrelevant" - ): + with static_proxy.connect(user="pinocchio", password="magic"): pass -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) -def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str): +def test_forward_params_to_client(static_proxy: NeonProxy): """ Check that we forward all necessary PostgreSQL server params to client. """ @@ -140,7 +155,7 @@ def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str): where name = any(%s) """ - with static_proxy.connect(options=f"{option_name}=irrelevant") as conn: + with static_proxy.connect() as conn: with conn.cursor() as cur: cur.execute(query, (reported_params_subset,)) for name, value in cur.fetchall(): @@ -148,18 +163,65 @@ def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str): assert conn.get_parameter_status(name) == value -@pytest.mark.parametrize("option_name", ["project", "endpoint"]) @pytest.mark.timeout(5) -def test_close_on_connections_exit(static_proxy: NeonProxy, option_name: str): +def test_close_on_connections_exit(static_proxy: NeonProxy): # Open two connections, send SIGTERM, then ensure that proxy doesn't exit # until after connections close. 
- with static_proxy.connect(options=f"{option_name}=irrelevant"), static_proxy.connect( - options=f"{option_name}=irrelevant" - ): + with static_proxy.connect(), static_proxy.connect(): static_proxy.terminate() with pytest.raises(subprocess.TimeoutExpired): static_proxy.wait_for_exit(timeout=2) # Ensure we don't accept any more connections with pytest.raises(psycopg2.OperationalError): - static_proxy.connect(options=f"{option_name}=irrelevant") + static_proxy.connect() static_proxy.wait_for_exit() + + +def test_sql_over_http(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + def q(sql: str, params: List[Any] = []) -> Any: + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps({"query": sql, "params": params}), + headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200 + return response.json() + + rows = q("select 42 as answer")["rows"] + assert rows == [{"answer": 42}] + + rows = q("select $1 as answer", [42])["rows"] + assert rows == [{"answer": "42"}] + + rows = q("select $1 * 1 as answer", [42])["rows"] + assert rows == [{"answer": 42}] + + rows = q("select $1::int[] as answer", [[1, 2, 3]])["rows"] + assert rows == [{"answer": [1, 2, 3]}] + + rows = q("select $1::json->'a' as answer", [{"a": {"b": 42}}])["rows"] + assert rows == [{"answer": {"b": 42}}] + + rows = q("select * from pg_class limit 1")["rows"] + assert len(rows) == 1 + + res = q("create table t(id serial primary key, val int)") + assert res["command"] == "CREATE" + assert res["rowCount"] is None + + res = q("insert into t(val) values (10), (20), (30) returning id") + assert res["command"] == "INSERT" + assert res["rowCount"] == 3 + assert res["rows"] == [{"id": 1}, {"id": 2}, {"id": 3}] + + res = q("select * from t") + assert res["command"] == "SELECT" + assert res["rowCount"] == 3 + + res = q("drop table t") + assert res["command"] == "DROP" + assert res["rowCount"] is None From a475cdf642df08501c3c2035bcee454da68a3dfa Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 13 Jun 2023 13:34:56 +0200 Subject: [PATCH 055/209] [compute_ctl] Fix logging if catalog updates are skipped (#4480) Otherwise, it wasn't clear from the log when Postgres started up completely if catalog updates were skipped. 
Follow-up for 4936ab6 --- compute_tools/src/compute.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 977708a18f..94cebf93de 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -370,11 +370,6 @@ impl ComputeNode { // 'Close' connection drop(client); - info!( - "finished configuration of compute for project {}", - spec.cluster.cluster_id.as_deref().unwrap_or("None") - ); - Ok(()) } @@ -427,22 +422,22 @@ impl ComputeNode { #[instrument(skip(self))] pub fn start_compute(&self) -> Result { let compute_state = self.state.lock().unwrap().clone(); - let spec = compute_state.pspec.as_ref().expect("spec must be set"); + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( "starting compute for project {}, operation {}, tenant {}, timeline {}", - spec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), - spec.spec.operation_uuid.as_deref().unwrap_or("None"), - spec.tenant_id, - spec.timeline_id, + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"), + pspec.spec.operation_uuid.as_deref().unwrap_or("None"), + pspec.tenant_id, + pspec.timeline_id, ); self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); - let pg = self.start_postgres(spec.storage_auth_token.clone())?; + let pg = self.start_postgres(pspec.storage_auth_token.clone())?; - if spec.spec.mode == ComputeMode::Primary && !spec.spec.skip_pg_catalog_updates { + if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates { self.apply_config(&compute_state)?; } @@ -462,6 +457,11 @@ impl ComputeNode { } self.set_status(ComputeStatus::Running); + info!( + "finished configuration of compute for project {}", + pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None") + ); + Ok(pg) } From e437787c8fdefc9859a37c69c31d0bf336f84aa9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 21 Jun 2023 15:50:52 +0300 Subject: [PATCH 056/209] cargo update -p openssl (#4542) To unblock release https://github.com/neondatabase/neon/pull/4536#issuecomment-1600678054 Context: https://rustsec.org/advisories/RUSTSEC-2023-0044 --- Cargo.lock | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 71a6699c50..4be74614c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2349,9 +2349,9 @@ checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "openssl" -version = "0.10.52" +version = "0.10.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01b8574602df80f7b85fdfc5392fa884a4e3b3f4f35402c070ab34c3d3f78d56" +checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" dependencies = [ "bitflags", "cfg-if", @@ -2381,9 +2381,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.87" +version = "0.9.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e17f59264b2809d77ae94f0e1ebabc434773f370d6ca667bd223ea10e06cc7e" +checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" dependencies = [ "cc", "libc", From 6bc756129065cb3d2e1be92289fce749e2920891 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 23 Jun 2023 20:43:20 +0200 Subject: [PATCH 057/209] don't use MGMT_REQUEST_RUNTIME for consumption metrics synthetic size worker The consumption metrics synthetic size worker does logical size calculation. 
Logical size calculation currently does synchronous disk IO. This blocks the MGMT_REQUEST_RUNTIME's executor threads, starving other futures. While there's work on the way to move the synchronous disk IO into spawn_blocking, the quickfix here is to use the BACKGROUND_RUNTIME instead of MGMT_REQUEST_RUNTIME. Actually it's not just a quickfix. We simply shouldn't be blocking MGMT_REQUEST_RUNTIME executor threads on CPU or sync disk IO. That work isn't done yet, as many of the mgmt tasks still _do_ disk IO. But it's not as intensive as the logical size calculations that we're fixing here. While we're at it, fix disk-usage-based eviction in a similar way. It wasn't the culprit here, according to prod logs, but it can theoretically be a little CPU-intensive. More context, including graphs from Prod: https://neondb.slack.com/archives/C03F5SM1N02/p1687541681336949 (cherry picked from commit d6e35222ea592428b78401ff0053b51424674e03) --- pageserver/src/bin/pageserver.rs | 82 ++++++++++++++++---------------- pageserver/src/http/routes.rs | 4 +- 2 files changed, 42 insertions(+), 44 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 1fa5e4ab3b..b01ace63e4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -495,50 +495,50 @@ fn start_pageserver( Ok(()) }, ); + } - if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { - let background_jobs_barrier = background_jobs_barrier; - let metrics_ctx = RequestContext::todo_child( - TaskKind::MetricsCollection, - // This task itself shouldn't download anything. - // The actual size calculation does need downloads, and - // creates a child context with the right DownloadBehavior. - DownloadBehavior::Error, - ); - task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), - TaskKind::MetricsCollection, - None, - None, - "consumption metrics collection", - true, - async move { - // first wait until background jobs are cleared to launch. - // - // this is because we only process active tenants and timelines, and the - // Timeline::get_current_logical_size will spawn the logical size calculation, - // which will not be rate-limited. - let cancel = task_mgr::shutdown_token(); + if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint { + let background_jobs_barrier = background_jobs_barrier; + let metrics_ctx = RequestContext::todo_child( + TaskKind::MetricsCollection, + // This task itself shouldn't download anything. + // The actual size calculation does need downloads, and + // creates a child context with the right DownloadBehavior. + DownloadBehavior::Error, + ); + task_mgr::spawn( + crate::BACKGROUND_RUNTIME.handle(), + TaskKind::MetricsCollection, + None, + None, + "consumption metrics collection", + true, + async move { + // first wait until background jobs are cleared to launch. + // + // this is because we only process active tenants and timelines, and the + // Timeline::get_current_logical_size will spawn the logical size calculation, + // which will not be rate-limited. + let cancel = task_mgr::shutdown_token(); - tokio::select! { - _ = cancel.cancelled() => { return Ok(()); }, - _ = background_jobs_barrier.wait() => {} - }; + tokio::select! 
{ + _ = cancel.cancelled() => { return Ok(()); }, + _ = background_jobs_barrier.wait() => {} + }; - pageserver::consumption_metrics::collect_metrics( - metric_collection_endpoint, - conf.metric_collection_interval, - conf.cached_metric_collection_interval, - conf.synthetic_size_calculation_interval, - conf.id, - metrics_ctx, - ) - .instrument(info_span!("metrics_collection")) - .await?; - Ok(()) - }, - ); - } + pageserver::consumption_metrics::collect_metrics( + metric_collection_endpoint, + conf.metric_collection_interval, + conf.cached_metric_collection_interval, + conf.synthetic_size_calculation_interval, + conf.id, + metrics_ctx, + ) + .instrument(info_span!("metrics_collection")) + .await?; + Ok(()) + }, + ); } // Spawn a task to listen for libpq connections. It will spawn further tasks diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index fc8da70cc0..0a55741f84 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1128,8 +1128,6 @@ async fn disk_usage_eviction_run( freed_bytes: 0, }; - use crate::task_mgr::MGMT_REQUEST_RUNTIME; - let (tx, rx) = tokio::sync::oneshot::channel(); let state = get_state(&r); @@ -1147,7 +1145,7 @@ async fn disk_usage_eviction_run( let _g = cancel.drop_guard(); crate::task_mgr::spawn( - MGMT_REQUEST_RUNTIME.handle(), + crate::task_mgr::BACKGROUND_RUNTIME.handle(), TaskKind::DiskUsageEviction, None, None, From feff887c6f5418316870533435564adbaacf8195 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 5 Jul 2023 18:40:25 +0100 Subject: [PATCH 058/209] Compile `pg_embedding` extension (#4634) ``` CREATE EXTENSION embedding; CREATE TABLE t (val real[]); INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL); CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3); INSERT INTO t (val) VALUES (array[1,2,4]); SELECT * FROM t ORDER BY val <-> array[3,3,3]; val --------- {1,2,3} {1,2,4} {1,1,1} {0,0,0} (5 rows) ``` --- Dockerfile.compute-node | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 310e4c32a3..7208024d63 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -515,6 +515,25 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4 make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control +######################################################################################### +# +# Layer "pg-embedding-pg-build" +# compile pg_embedding extension +# +######################################################################################### +FROM build-deps AS pg-embedding-pg-build +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PATH "/usr/local/pgsql/bin/:$PATH" +# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023 +# There is no release tag yet +RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \ + echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \ + mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . 
&& \ + make -j $(getconf _NPROCESSORS_ONLN) && \ + make -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control + ######################################################################################### # # Layer "pg-anon-pg-build" @@ -671,6 +690,7 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From efa6aa134f4190d4ad9c10eaa55b17abd79b9457 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 7 Jul 2023 17:50:50 +0100 Subject: [PATCH 059/209] allow repeated IO errors from compute node (#4624) ## Problem #4598 compute nodes are not accessible some time after wake up due to kubernetes DNS not being fully propagated. ## Summary of changes Update connect retry mechanism to support handling IO errors and sleeping for 100ms ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. --- proxy/src/http/conn_pool.rs | 69 +++++++++++++++++----------- proxy/src/proxy.rs | 92 ++++++++++++++++++++++++++++++------- proxy/src/proxy/tests.rs | 17 +++++++ 3 files changed, 136 insertions(+), 42 deletions(-) diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index 52c1e2f2ce..27950d3a20 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -2,16 +2,15 @@ use parking_lot::Mutex; use pq_proto::StartupMessageParams; use std::fmt; use std::{collections::HashMap, sync::Arc}; - -use futures::TryFutureExt; +use tokio::time; use crate::config; use crate::{auth, console}; use super::sql_over_http::MAX_RESPONSE_SIZE; -use crate::proxy::invalidate_cache; -use crate::proxy::NUM_RETRIES_WAKE_COMPUTE; +use crate::proxy::try_wake; +use crate::proxy::{BASE_RETRY_WAIT_DURATION, NUM_RETRIES_WAKE_COMPUTE}; use tracing::error; use tracing::info; @@ -223,32 +222,59 @@ async fn connect_to_compute( // This code is a copy of `connect_to_compute` from `src/proxy.rs` with // the difference that it uses `tokio_postgres` for the connection. - let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE; + let mut num_retries = 0; + let mut should_wake = true; loop { match connect_to_compute_once(node_info, conn_info).await { - Err(e) if num_retries > 0 => { - info!("compute node's state has changed; requesting a wake-up"); - match creds.wake_compute(&extra).await? { - // Update `node_info` and try one more time. - Some(new) => { - *node_info = new; + Err(e) if num_retries == NUM_RETRIES_WAKE_COMPUTE => { + if let Some(wait_duration) = retry_connect_in(&e, num_retries) { + error!(error = ?e, "could not connect to compute node"); + if should_wake { + match try_wake(node_info, &extra, &creds).await { + Ok(Some(x)) => should_wake = x, + Ok(None) => return Err(e.into()), + Err(e) => return Err(e.into()), + } } - // Link auth doesn't work that way, so we just exit. 
- None => return Err(e), + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + } else { + return Err(e.into()); } } - other => return other, + other => return Ok(other?), } - num_retries -= 1; - info!("retrying after wake-up ({num_retries} attempts left)"); + num_retries += 1; + info!(retries_left = num_retries, "retrying connect"); + } +} + +fn retry_connect_in(err: &tokio_postgres::Error, num_retries: u32) -> Option { + use tokio_postgres::error::SqlState; + match err.code() { + // retry all errors at least once immediately + _ if num_retries == 0 => Some(time::Duration::ZERO), + // keep retrying connection errors every 100ms + Some( + &SqlState::CONNECTION_FAILURE + | &SqlState::CONNECTION_EXCEPTION + | &SqlState::CONNECTION_DOES_NOT_EXIST + | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, + ) => { + // 3/2 = 1.5 which seems to be an ok growth factor heuristic + Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) + } + // otherwise, don't retry + _ => None, } } async fn connect_to_compute_once( node_info: &console::CachedNodeInfo, conn_info: &ConnInfo, -) -> anyhow::Result { +) -> Result { let mut config = (*node_info.config).clone(); let (client, connection) = config @@ -257,15 +283,6 @@ async fn connect_to_compute_once( .dbname(&conn_info.dbname) .max_backend_message_size(MAX_RESPONSE_SIZE) .connect(tokio_postgres::NoTls) - .inspect_err(|e: &tokio_postgres::Error| { - error!( - "failed to connect to compute node hosts={:?} ports={:?}: {}", - node_info.config.get_hosts(), - node_info.config.get_ports(), - e - ); - invalidate_cache(node_info) - }) .await?; tokio::spawn(async move { diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 2433412c4c..5c5353a63e 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -6,12 +6,17 @@ use crate::{ cancellation::{self, CancelMap}, compute::{self, PostgresConnection}, config::{ProxyConfig, TlsConfig}, - console::{self, messages::MetricsAuxInfo}, + console::{ + self, + errors::{ApiError, WakeComputeError}, + messages::MetricsAuxInfo, + }, error::io_error, stream::{PqStream, Stream}, }; use anyhow::{bail, Context}; use futures::TryFutureExt; +use hyper::StatusCode; use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; @@ -25,7 +30,9 @@ use tracing::{error, info, warn}; use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. -pub const NUM_RETRIES_WAKE_COMPUTE: usize = 1; +/// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n +pub const NUM_RETRIES_WAKE_COMPUTE: u32 = 10; +pub const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; @@ -315,7 +322,6 @@ async fn connect_to_compute_once( node_info .config .connect(allow_self_signed_compute, timeout) - .inspect_err(|_: &compute::ConnectionError| invalidate_cache(node_info)) .await } @@ -328,7 +334,8 @@ async fn connect_to_compute( extra: &console::ConsoleReqExtra<'_>, creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, ) -> Result { - let mut num_retries: usize = NUM_RETRIES_WAKE_COMPUTE; + let mut num_retries = 0; + let mut should_wake = true; loop { // Apply startup params to the (possibly, cached) compute node info. 
node_info.config.set_startup_params(params); @@ -346,30 +353,83 @@ async fn connect_to_compute( // We only use caching in case of scram proxy backed by the console, so reduce // the timeout only in that case. let is_scram_proxy = matches!(creds, auth::BackendType::Console(_, _)); - let timeout = if is_scram_proxy && num_retries == NUM_RETRIES_WAKE_COMPUTE { + let timeout = if is_scram_proxy && num_retries == 0 { time::Duration::from_secs(2) } else { time::Duration::from_secs(10) }; match connect_to_compute_once(node_info, timeout).await { - Err(e) if num_retries > 0 => { - info!("compute node's state has changed; requesting a wake-up"); - match creds.wake_compute(extra).map_err(io_error).await? { - // Update `node_info` and try one more time. - Some(mut new) => { - new.config.reuse_password(&node_info.config); - *node_info = new; + Err(e) if num_retries < NUM_RETRIES_WAKE_COMPUTE => { + if let Some(wait_duration) = retry_connect_in(&e, num_retries) { + error!(error = ?e, "could not connect to compute node"); + if should_wake { + match try_wake(node_info, extra, creds).await { + Ok(Some(x)) => { + should_wake = x; + } + Ok(None) => return Err(e), + Err(e) => return Err(io_error(e).into()), + } } - // Link auth doesn't work that way, so we just exit. - None => return Err(e), + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + } else { + return Err(e); } } other => return other, } - num_retries -= 1; - info!("retrying after wake-up ({num_retries} attempts left)"); + num_retries += 1; + info!(retries_left = num_retries, "retrying connect"); + } +} + +/// Attempts to wake up the compute node. +/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable +/// * Returns Ok(Some(false)) if the wakeup succeeded +/// * Returns Ok(None) or Err(e) if there was an error +pub async fn try_wake( + node_info: &mut console::CachedNodeInfo, + extra: &console::ConsoleReqExtra<'_>, + creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, +) -> Result, WakeComputeError> { + info!("compute node's state has likely changed; requesting a wake-up"); + invalidate_cache(node_info); + match creds.wake_compute(extra).await { + // retry wake if the compute was in an invalid state + Err(WakeComputeError::ApiError(ApiError::Console { + status: StatusCode::BAD_REQUEST, + .. + })) => Ok(Some(true)), + // Update `node_info` and try again. + Ok(Some(mut new)) => { + new.config.reuse_password(&node_info.config); + *node_info = new; + Ok(Some(false)) + } + Err(e) => Err(e), + Ok(None) => Ok(None), + } +} + +fn retry_connect_in(err: &compute::ConnectionError, num_retries: u32) -> Option { + use std::io::ErrorKind; + match err { + // retry all errors at least once immediately + _ if num_retries == 0 => Some(time::Duration::ZERO), + // keep retrying connection errors every 100ms + compute::ConnectionError::CouldNotConnect(io_err) => match io_err.kind() { + ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable => { + // 3/2 = 1.5 which seems to be an ok growth factor heuristic + Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) + } + _ => None, + }, + // otherwise, don't retry + _ => None, } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 3373c49676..a1f6cd3ed4 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,4 +1,6 @@ //! A group of high-level tests for connection establishing logic and auth. 
+use std::io; + use super::*; use crate::{auth, sasl, scram}; use async_trait::async_trait; @@ -294,3 +296,18 @@ async fn scram_auth_mock() -> anyhow::Result<()> { Ok(()) } + +#[test] +fn connect_compute_total_wait() { + let err = compute::ConnectionError::CouldNotConnect(io::Error::new( + io::ErrorKind::ConnectionRefused, + "conn refused", + )); + + let mut total_wait = tokio::time::Duration::ZERO; + for num_retries in 0..10 { + total_wait += retry_connect_in(&err, num_retries).unwrap(); + } + assert!(total_wait < tokio::time::Duration::from_secs(12)); + assert!(total_wait > tokio::time::Duration::from_secs(10)); +} From 39a28d11083eefc72c5f302c21e0e74742129e14 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 12 Jul 2023 11:38:36 +0100 Subject: [PATCH 060/209] proxy wake_compute loop (#4675) ## Problem If we fail to wake up the compute node, a subsequent connect attempt will definitely fail. However, kubernetes won't fail the connection immediately, instead it hangs until we timeout (10s). ## Summary of changes Refactor the loop to allow fast retries of compute_wake and to skip a connect attempt. --- proxy/src/http/conn_pool.rs | 81 ++++++++++++++++----------- proxy/src/proxy.rs | 108 ++++++++++++++++++++++++------------ proxy/src/proxy/tests.rs | 9 +-- 3 files changed, 121 insertions(+), 77 deletions(-) diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index 27950d3a20..fb53c663c8 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -1,6 +1,7 @@ use parking_lot::Mutex; use pq_proto::StartupMessageParams; use std::fmt; +use std::ops::ControlFlow; use std::{collections::HashMap, sync::Arc}; use tokio::time; @@ -9,8 +10,7 @@ use crate::{auth, console}; use super::sql_over_http::MAX_RESPONSE_SIZE; -use crate::proxy::try_wake; -use crate::proxy::{BASE_RETRY_WAIT_DURATION, NUM_RETRIES_WAKE_COMPUTE}; +use crate::proxy::{invalidate_cache, retry_after, try_wake, NUM_RETRIES_WAKE_COMPUTE}; use tracing::error; use tracing::info; @@ -184,11 +184,10 @@ impl GlobalConnPool { } } -// // Wake up the destination if needed. Code here is a bit involved because // we reuse the code from the usual proxy and we need to prepare few structures // that this code expects. -// +#[tracing::instrument(skip_all)] async fn connect_to_compute( config: &config::ProxyConfig, conn_info: &ConnInfo, @@ -220,54 +219,72 @@ async fn connect_to_compute( let node_info = &mut creds.wake_compute(&extra).await?.expect("msg"); - // This code is a copy of `connect_to_compute` from `src/proxy.rs` with - // the difference that it uses `tokio_postgres` for the connection. 
let mut num_retries = 0; - let mut should_wake = true; + let mut wait_duration = time::Duration::ZERO; + let mut should_wake_with_error = None; loop { + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + + // try wake the compute node if we have determined it's sensible to do so + if let Some(err) = should_wake_with_error.take() { + match try_wake(node_info, &extra, &creds).await { + // we can't wake up the compute node + Ok(None) => return Err(err), + // there was an error communicating with the control plane + Err(e) => return Err(e.into()), + // failed to wake up but we can continue to retry + Ok(Some(ControlFlow::Continue(()))) => { + wait_duration = retry_after(num_retries); + should_wake_with_error = Some(err); + + num_retries += 1; + info!(num_retries, "retrying wake compute"); + continue; + } + // successfully woke up a compute node and can break the wakeup loop + Ok(Some(ControlFlow::Break(()))) => {} + } + } + match connect_to_compute_once(node_info, conn_info).await { - Err(e) if num_retries == NUM_RETRIES_WAKE_COMPUTE => { - if let Some(wait_duration) = retry_connect_in(&e, num_retries) { - error!(error = ?e, "could not connect to compute node"); - if should_wake { - match try_wake(node_info, &extra, &creds).await { - Ok(Some(x)) => should_wake = x, - Ok(None) => return Err(e.into()), - Err(e) => return Err(e.into()), - } - } - if !wait_duration.is_zero() { - time::sleep(wait_duration).await; - } - } else { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + if !can_retry_error(&e, num_retries) { return Err(e.into()); } + wait_duration = retry_after(num_retries); + + // after the first connect failure, + // we should invalidate the cache and wake up a new compute node + if num_retries == 0 { + invalidate_cache(node_info); + should_wake_with_error = Some(e.into()); + } } - other => return Ok(other?), } num_retries += 1; - info!(retries_left = num_retries, "retrying connect"); + info!(num_retries, "retrying connect"); } } -fn retry_connect_in(err: &tokio_postgres::Error, num_retries: u32) -> Option { +fn can_retry_error(err: &tokio_postgres::Error, num_retries: u32) -> bool { use tokio_postgres::error::SqlState; match err.code() { - // retry all errors at least once immediately - _ if num_retries == 0 => Some(time::Duration::ZERO), - // keep retrying connection errors every 100ms + // retry all errors at least once + _ if num_retries == 0 => true, + // keep retrying connection errors Some( &SqlState::CONNECTION_FAILURE | &SqlState::CONNECTION_EXCEPTION | &SqlState::CONNECTION_DOES_NOT_EXIST | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION, - ) => { - // 3/2 = 1.5 which seems to be an ok growth factor heuristic - Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) - } + ) if num_retries < NUM_RETRIES_WAKE_COMPUTE => true, // otherwise, don't retry - _ => None, + _ => false, } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5c5353a63e..12ca9c5187 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -20,7 +20,7 @@ use hyper::StatusCode; use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; -use std::sync::Arc; +use std::{ops::ControlFlow, sync::Arc}; use tokio::{ io::{AsyncRead, AsyncWrite, AsyncWriteExt}, time, @@ -32,7 +32,7 @@ use utils::measured_stream::MeasuredStream; /// Number of times we should retry the 
`/proxy_wake_compute` http request. /// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n pub const NUM_RETRIES_WAKE_COMPUTE: u32 = 10; -pub const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); +const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; @@ -335,11 +335,37 @@ async fn connect_to_compute( creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, ) -> Result { let mut num_retries = 0; - let mut should_wake = true; + let mut wait_duration = time::Duration::ZERO; + let mut should_wake_with_error = None; loop { // Apply startup params to the (possibly, cached) compute node info. node_info.config.set_startup_params(params); + if !wait_duration.is_zero() { + time::sleep(wait_duration).await; + } + + // try wake the compute node if we have determined it's sensible to do so + if let Some(err) = should_wake_with_error.take() { + match try_wake(node_info, extra, creds).await { + // we can't wake up the compute node + Ok(None) => return Err(err), + // there was an error communicating with the control plane + Err(e) => return Err(io_error(e).into()), + // failed to wake up but we can continue to retry + Ok(Some(ControlFlow::Continue(()))) => { + wait_duration = retry_after(num_retries); + should_wake_with_error = Some(err); + + num_retries += 1; + info!(num_retries, "retrying wake compute"); + continue; + } + // successfully woke up a compute node and can break the wakeup loop + Ok(Some(ControlFlow::Break(()))) => {} + } + } + // Set a shorter timeout for the initial connection attempt. // // In case we try to connect to an outdated address that is no longer valid, the @@ -359,31 +385,29 @@ async fn connect_to_compute( time::Duration::from_secs(10) }; + // do this again to ensure we have username? 
+ node_info.config.set_startup_params(params); + match connect_to_compute_once(node_info, timeout).await { - Err(e) if num_retries < NUM_RETRIES_WAKE_COMPUTE => { - if let Some(wait_duration) = retry_connect_in(&e, num_retries) { - error!(error = ?e, "could not connect to compute node"); - if should_wake { - match try_wake(node_info, extra, creds).await { - Ok(Some(x)) => { - should_wake = x; - } - Ok(None) => return Err(e), - Err(e) => return Err(io_error(e).into()), - } - } - if !wait_duration.is_zero() { - time::sleep(wait_duration).await; - } - } else { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + if !can_retry_error(&e, num_retries) { return Err(e); } + wait_duration = retry_after(num_retries); + + // after the first connect failure, + // we should invalidate the cache and wake up a new compute node + if num_retries == 0 { + invalidate_cache(node_info); + should_wake_with_error = Some(e); + } } - other => return other, } num_retries += 1; - info!(retries_left = num_retries, "retrying connect"); + info!(num_retries, "retrying connect"); } } @@ -395,41 +419,51 @@ pub async fn try_wake( node_info: &mut console::CachedNodeInfo, extra: &console::ConsoleReqExtra<'_>, creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, -) -> Result, WakeComputeError> { +) -> Result>, WakeComputeError> { info!("compute node's state has likely changed; requesting a wake-up"); - invalidate_cache(node_info); match creds.wake_compute(extra).await { // retry wake if the compute was in an invalid state Err(WakeComputeError::ApiError(ApiError::Console { status: StatusCode::BAD_REQUEST, .. - })) => Ok(Some(true)), + })) => Ok(Some(ControlFlow::Continue(()))), // Update `node_info` and try again. Ok(Some(mut new)) => { new.config.reuse_password(&node_info.config); *node_info = new; - Ok(Some(false)) + Ok(Some(ControlFlow::Break(()))) } Err(e) => Err(e), Ok(None) => Ok(None), } } -fn retry_connect_in(err: &compute::ConnectionError, num_retries: u32) -> Option { +fn can_retry_error(err: &compute::ConnectionError, num_retries: u32) -> bool { use std::io::ErrorKind; match err { - // retry all errors at least once immediately - _ if num_retries == 0 => Some(time::Duration::ZERO), - // keep retrying connection errors every 100ms - compute::ConnectionError::CouldNotConnect(io_err) => match io_err.kind() { - ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable => { - // 3/2 = 1.5 which seems to be an ok growth factor heuristic - Some(BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries)) - } - _ => None, - }, + // retry all errors at least once + _ if num_retries == 0 => true, + // keep retrying connection errors + compute::ConnectionError::CouldNotConnect(io_err) + if num_retries < NUM_RETRIES_WAKE_COMPUTE => + { + matches!( + io_err.kind(), + ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable + ) + } // otherwise, don't retry - _ => None, + _ => false, + } +} + +pub fn retry_after(num_retries: u32) -> time::Duration { + match num_retries { + 0 => time::Duration::ZERO, + _ => { + // 3/2 = 1.5 which seems to be an ok growth factor heuristic + BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries) + } } } diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index a1f6cd3ed4..b9215cd90e 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,6 +1,4 @@ //! A group of high-level tests for connection establishing logic and auth. 
-use std::io; - use super::*; use crate::{auth, sasl, scram}; use async_trait::async_trait; @@ -299,14 +297,9 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { - let err = compute::ConnectionError::CouldNotConnect(io::Error::new( - io::ErrorKind::ConnectionRefused, - "conn refused", - )); - let mut total_wait = tokio::time::Duration::ZERO; for num_retries in 0..10 { - total_wait += retry_connect_in(&err, num_retries).unwrap(); + total_wait += retry_after(num_retries); } assert!(total_wait < tokio::time::Duration::from_secs(12)); assert!(total_wait > tokio::time::Duration::from_secs(10)); From 4204960942fc456906396072c63b373f600b89e5 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 1 Aug 2023 11:52:35 +0200 Subject: [PATCH 061/209] ci: fix upload-postgres-extensions-to-s3 job commit commit 5f8fd640bfa8e5d4d23dbc3df1b0a521ec666e56 Author: Alek Westover Date: Wed Jul 26 08:24:03 2023 -0400 Upload Test Remote Extensions (#4792) switched to using the release tag instead of `latest`, but, the `promote-images` job only uploads `latest` to the prod ECR. The switch to using release tag was good in principle, but, reverting that part to make the release pipeine work. Note that a proper fix should abandon use of `:latest` tag at all: currently, if a `main` pipeline runs concurrently with a `release` pipeline, the `release` pipeline may end up using the `main` pipeline's images. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 27bad61639..dcdc388a93 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -955,7 +955,7 @@ jobs: version: [ v14, v15 ] env: - EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} + EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }} S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }} From 6eae4fc9aa9d143f6f2000fa2d761ccccf079710 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 3 Aug 2023 07:48:09 +0100 Subject: [PATCH 062/209] Release 2023-08-02: update pg_embedding (#4877) Cherry-picking ca4d71a9549be5638d84e6a9ff5c66ca14cbb05d from `main` into the `release` Co-authored-by: Vadim Kharitonov --- Dockerfile.compute-node | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 9759faf733..b2d096dc43 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -551,8 +551,8 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \ - echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \ +RUN wget 
https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.5.tar.gz -O pg_embedding.tar.gz && \ + echo "0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 pg_embedding.tar.gz" | sha256sum --check && \ mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ From b1ddd01289f1092781d5603a88d0e75a9c23bc2b Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Thu, 3 Aug 2023 15:28:31 +0200 Subject: [PATCH 063/209] Define NEON_SMGR to make it possible for extensions to use Neon SMG API (#4889) Co-authored-by: Konstantin Knizhnik Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index ebedb34d01..da3885c34d 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f +Subproject commit da3885c34db312afd555802be2ce985fafd1d8ad diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 1220c8a63f..770c6dffc5 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 1220c8a63f00101829f9222a5821fc084b4384c7 +Subproject commit 770c6dffc5ef6aac05bf049693877fb377eea6fc diff --git a/vendor/revisions.json b/vendor/revisions.json index f5d7428cd9..8579b5abaa 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,4 +1,4 @@ { - "postgres-v15": "1220c8a63f00101829f9222a5821fc084b4384c7", - "postgres-v14": "ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f" + "postgres-v15": "770c6dffc5ef6aac05bf049693877fb377eea6fc", + "postgres-v14": "da3885c34db312afd555802be2ce985fafd1d8ad" } From e78ac221077827a21f155ab7dc697b13159479d7 Mon Sep 17 00:00:00 2001 From: Felix Prasanna <91577249+fprasx@users.noreply.github.com> Date: Tue, 8 Aug 2023 13:08:46 -0500 Subject: [PATCH 064/209] release fix: revert vm builder bump from 0.13.1 -> 0.15.0-alpha1 (#4932) This reverts commit 682dfb3a31ad13fd70b1d05fb099e4f54f93bf4e. hotfix for a CLI arg issue in the monitor --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 16fc22ce62..dcdc388a93 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -794,7 +794,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.15.0-alpha1 + VM_BUILDER_VERSION: v0.13.1 steps: - name: Checkout From d7d066d493074b1d68deaba59c15165f96ac7bda Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 8 Aug 2023 15:19:24 +0100 Subject: [PATCH 065/209] proxy: delay auth on retry (#4929) ## Problem When an endpoint is shutting down, it can take a few seconds. Currently when starting a new compute, this causes an "endpoint is in transition" error. We need to add delays before retrying to ensure that we allow time for the endpoint to shutdown properly. ## Summary of changes Adds a delay before retrying in auth. 
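For reference, with the existing BASE_RETRY_WAIT_DURATION of 100ms and the 1.5 growth
factor used by retry_after, the per-attempt delays come out to roughly 100ms, 150ms,
225ms and so on; ten retries sum to about 100ms * (1.5^10 - 1) / 0.5, i.e. roughly
11.3 seconds, which is what the connect_compute_total_wait test pins between 10 and
12 seconds.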
connect_to_compute already has this delay --- proxy/src/auth/backend/classic.rs | 7 +++++-- proxy/src/proxy.rs | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index d8ee1e8b64..46bd215f3b 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -5,7 +5,7 @@ use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra}, - proxy::handle_try_wake, + proxy::{handle_try_wake, retry_after}, sasl, scram, stream::PqStream, }; @@ -62,10 +62,13 @@ pub(super) async fn authenticate( } Ok(ControlFlow::Continue(e)) => { warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); - num_retries += 1; } Ok(ControlFlow::Break(n)) => break n, } + + let wait_duration = retry_after(num_retries); + num_retries += 1; + tokio::time::sleep(wait_duration).await; }; if let Some(keys) = scram_keys { use tokio_postgres::config::AuthKeys; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index e6fcf901d9..0267d767ee 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -545,7 +545,7 @@ impl ShouldRetry for compute::ConnectionError { } } -fn retry_after(num_retries: u32) -> time::Duration { +pub fn retry_after(num_retries: u32) -> time::Duration { // 1.5 seems to be an ok growth factor heuristic BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32)) } From 2f74287c9b24f2e575bd84fa8d06701478f1f26f Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 15 Aug 2023 15:31:48 +0000 Subject: [PATCH 066/209] Disable neon-pool-opt-in --- proxy/src/http/sql_over_http.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index aa06a91bba..09642b9a93 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -193,7 +193,7 @@ pub async fn handle( let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); // Allow connection pooling only if explicitly requested - let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + let allow_pool = false; // isolation level and read only From fec2ad6283184047cfb758880ca095fbd1493115 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Tue, 15 Aug 2023 15:45:03 +0000 Subject: [PATCH 067/209] Fix lint --- proxy/src/http/sql_over_http.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index 09642b9a93..c7fd7f6e39 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -44,7 +44,6 @@ const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); -static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level"); static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); From 4e2e44e5240959ac48bb30571eee136c0226c989 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Tue, 22 Aug 2023 10:06:14 +0200 Subject: [PATCH 068/209] Enable neon-pool-opt-in (#5062) --- proxy/src/http/sql_over_http.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index 782e37641c..fe57096105 100644 --- 
a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -44,6 +44,7 @@ const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); +static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level"); static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); static TXN_DEFERRABLE: HeaderName = HeaderName::from_static("neon-batch-deferrable"); @@ -194,7 +195,7 @@ pub async fn handle( let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE); // Allow connection pooling only if explicitly requested - let allow_pool = false; + let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); // isolation level, read only and deferrable From 67db8432b481e585f654616f2260949aa137e9a2 Mon Sep 17 00:00:00 2001 From: Alek Westover Date: Tue, 22 Aug 2023 11:41:32 -0400 Subject: [PATCH 069/209] Fix cargo deny errors (#5068) ## Problem cargo deny lint broken Links to the CVEs: [rustsec.org/advisories/RUSTSEC-2023-0052](https://rustsec.org/advisories/RUSTSEC-2023-0052) [rustsec.org/advisories/RUSTSEC-2023-0053](https://rustsec.org/advisories/RUSTSEC-2023-0053) One is fixed, the other one isn't so we allow it (for now), to unbreak CI. Then later we'll try to get rid of webpki in favour of the rustls fork. ## Summary of changes ``` +ignore = ["RUSTSEC-2023-0052"] ``` --- Cargo.lock | 4 ++-- deny.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 23741e2a3d..81d2a04122 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3569,9 +3569,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.100.1" +version = "0.100.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" +checksum = "e98ff011474fa39949b7e5c0428f9b4937eda7da7848bbb947786b7be0b27dab" dependencies = [ "ring", "untrusted", diff --git a/deny.toml b/deny.toml index cfb699bb21..71006b14cd 100644 --- a/deny.toml +++ b/deny.toml @@ -18,7 +18,7 @@ vulnerability = "deny" unmaintained = "warn" yanked = "warn" notice = "warn" -ignore = [] +ignore = ["RUSTSEC-2023-0052"] # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: From ce1753d036ea4d8be87e446ebe8c448e134862d1 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 25 Aug 2023 13:08:45 +0100 Subject: [PATCH 070/209] proxy: dont return connection pending (#5107) ## Problem We were returning Pending when a connection had a notice/notification (introduced recently in #5020). When returning pending, the runtime assumes you will call `cx.waker().wake()` in order to continue processing. We weren't doing that, so the connection task would get stuck ## Summary of changes Don't return pending. 
Loop instead --- proxy/src/http/conn_pool.rs | 42 +++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/proxy/src/http/conn_pool.rs b/proxy/src/http/conn_pool.rs index c02ec61945..e771e5d7ed 100644 --- a/proxy/src/http/conn_pool.rs +++ b/proxy/src/http/conn_pool.rs @@ -408,9 +408,9 @@ async fn connect_to_compute_once( let (tx, mut rx) = tokio::sync::watch::channel(session); let conn_id = uuid::Uuid::new_v4(); - let span = info_span!(parent: None, "connection", %conn_info, %conn_id); + let span = info_span!(parent: None, "connection", %conn_id); span.in_scope(|| { - info!(%session, "new connection"); + info!(%conn_info, %session, "new connection"); }); tokio::spawn( @@ -420,26 +420,28 @@ async fn connect_to_compute_once( info!(%session, "changed session"); } - let message = ready!(connection.poll_message(cx)); + loop { + let message = ready!(connection.poll_message(cx)); - match message { - Some(Ok(AsyncMessage::Notice(notice))) => { - info!(%session, "notice: {}", notice); - Poll::Pending + match message { + Some(Ok(AsyncMessage::Notice(notice))) => { + info!(%session, "notice: {}", notice); + } + Some(Ok(AsyncMessage::Notification(notif))) => { + warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received"); + } + Some(Ok(_)) => { + warn!(%session, "unknown message"); + } + Some(Err(e)) => { + error!(%session, "connection error: {}", e); + return Poll::Ready(()) + } + None => { + info!("connection closed"); + return Poll::Ready(()) + } } - Some(Ok(AsyncMessage::Notification(notif))) => { - warn!(%session, pid = notif.process_id(), channel = notif.channel(), "notification received"); - Poll::Pending - } - Some(Ok(_)) => { - warn!(%session, "unknown message"); - Poll::Pending - } - Some(Err(e)) => { - error!(%session, "connection error: {}", e); - Poll::Ready(()) - } - None => Poll::Ready(()), } }) .instrument(span) From 521438a5c6bbc5ecc061f35593b2dbb1715e0e06 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 12 Sep 2023 11:23:46 +0200 Subject: [PATCH 071/209] fix deadlock around TENANTS (#5285) The sequence that can lead to a deadlock: 1. DELETE request gets all the way to `tenant.shutdown(progress, false).await.is_err() ` , while holding TENANTS.read() 2. POST request for tenant creation comes in, calls `tenant_map_insert`, it does `let mut guard = TENANTS.write().await;` 3. Something that `tenant.shutdown()` needs to wait for needs a `TENANTS.read().await`. The only case identified in exhaustive manual scanning of the code base is this one: Imitate size access does `get_tenant().await`, which does `TENANTS.read().await` under the hood. In the above case (1) waits for (3), (3)'s read-lock request is queued behind (2)'s write-lock, and (2) waits for (1). Deadlock. I made a reproducer/proof-that-above-hypothesis-holds in https://github.com/neondatabase/neon/pull/5281 , but, it's not ready for merge yet and we want the fix _now_. fixes https://github.com/neondatabase/neon/issues/5284 --- pageserver/src/tenant/mgr.rs | 2 ++ .../src/tenant/timeline/eviction_task.rs | 21 ++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 0cf5c36b87..74faee1115 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -503,6 +503,8 @@ pub enum GetTenantError { /// Gets the tenant from the in-memory data, erroring if it's absent or is not fitting to the query. 
/// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants. +/// +/// This method is cancel-safe. pub async fn get_tenant( tenant_id: TenantId, active_only: bool, diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 3e407dda57..39f0d03a01 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -328,9 +328,24 @@ impl Timeline { // Make one of the tenant's timelines draw the short straw and run the calculation. // The others wait until the calculation is done so that they take into account the // imitated accesses that the winner made. - let Ok(tenant) = crate::tenant::mgr::get_tenant(self.tenant_id, true).await else { - // likely, we're shutting down - return ControlFlow::Break(()); + // + // It is critical we are responsive to cancellation here. Otherwise, we deadlock with + // tenant deletion (holds TENANTS in read mode) any other task that attempts to + // acquire TENANTS in write mode before we here call get_tenant. + // See https://github.com/neondatabase/neon/issues/5284. + let res = tokio::select! { + _ = cancel.cancelled() => { + return ControlFlow::Break(()); + } + res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => { + res + } + }; + let tenant = match res { + Ok(t) => t, + Err(_) => { + return ControlFlow::Break(()); + } }; let mut state = tenant.eviction_task_tenant_state.lock().await; match state.last_layer_access_imitation { From 373c7057cc73fd13f2873c353ced250ddb900700 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 14 Sep 2023 03:21:50 -0700 Subject: [PATCH 072/209] vm-monitor: Fix cgroup throttling (#5303) I believe this (not actual IO problems) is the cause of the "disk speed issue" that we've had for VMs recently. See e.g.: 1. https://neondb.slack.com/archives/C03H1K0PGKH/p1694287808046179?thread_ts=1694271790.580099&cid=C03H1K0PGKH 2. https://neondb.slack.com/archives/C03H1K0PGKH/p1694511932560659 The vm-informant (and now, the vm-monitor, its replacement) is supposed to gradually increase the `neon-postgres` cgroup's memory.high value, because otherwise the kernel will throttle all the processes in the cgroup. This PR fixes a bug with the vm-monitor's implementation of this behavior. --- Other references, for the vm-informant's implementation: - Original issue: neondatabase/autoscaling#44 - Original PR: neondatabase/autoscaling#223 --- libs/vm_monitor/src/cgroup.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 4f529d16fd..fe1c514f76 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -315,12 +315,8 @@ impl CgroupWatcher { where E: Stream>, { - // There are several actions might do when receiving a `memory.high`, - // such as freezing the cgroup, or increasing its `memory.high`. We don't - // want to do these things too often (because postgres needs to run, and - // we only have so much memory). These timers serve as rate limits for this. let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO)); - let mut wait_to_increase_memory_high = pin!(tokio::time::sleep(Duration::ZERO)); + let mut last_memory_high_increase_at: Option = None; let mut events = pin!(events); // Are we waiting to be upscaled? 
Could be true if we request upscale due @@ -332,6 +328,8 @@ impl CgroupWatcher { upscale = upscales.recv() => { let Sequenced { seqnum, data } = upscale .context("failed to listen on upscale notification channel")?; + waiting_on_upscale = false; + last_memory_high_increase_at = None; self.last_upscale_seqnum.store(seqnum, Ordering::Release); info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale"); } @@ -396,12 +394,17 @@ impl CgroupWatcher { .send(()) .await .context("failed to request upscale")?; + waiting_on_upscale = true; continue; } // Shoot, we can't freeze or and we're still waiting on upscale, // increase memory.high to reduce throttling - if wait_to_increase_memory_high.is_elapsed() { + let can_increase_memory_high = match last_memory_high_increase_at { + None => true, + Some(t) => t.elapsed() > self.config.memory_high_increase_every, + }; + if can_increase_memory_high { info!( "received memory.high event, \ but too soon to refreeze and already requested upscale \ @@ -437,12 +440,11 @@ impl CgroupWatcher { ); self.set_high_bytes(new_high) .context("failed to set memory.high")?; - wait_to_increase_memory_high - .as_mut() - .reset(Instant::now() + self.config.memory_high_increase_every) + last_memory_high_increase_at = Some(Instant::now()); + continue; } - // we can't do anything + info!("received memory.high event, but can't do anything"); } }; } From 6686ede30f4e5271c16616d6ca08a17ab307e9c8 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 14 Sep 2023 16:17:50 +0100 Subject: [PATCH 073/209] Update checksum for pg_hint_plan (#5309) ## Problem The checksum for `pg_hint_plan` doesn't match: ``` sha256sum: WARNING: 1 computed checksum did NOT match ``` Ref https://github.com/neondatabase/neon/actions/runs/6185715461/job/16793609251?pr=5307 It seems that the release was retagged yesterday: https://github.com/ossc-db/pg_hint_plan/releases/tag/REL16_1_6_0 I don't see any malicious changes from 15_1.5.1: https://github.com/ossc-db/pg_hint_plan/compare/REL15_1_5_1...REL16_1_6_0, so it should be ok to update. ## Summary of changes - Update checksum for `pg_hint_plan` 16_1.6.0 --- Dockerfile.compute-node | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index ecf76f97c2..d724185d5d 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -416,7 +416,7 @@ RUN case "${PG_VERSION}" in \ ;; \ "v16") \ export PG_HINT_PLAN_VERSION=16_1_6_0 \ - export PG_HINT_PLAN_CHECKSUM=ce6a8040c78012000f5da7240caf6a971401412f41d33f930f09291e6c304b99 \ + export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ From a5987eebfdf1ea3cc39d9730ed9f1f092e0ebf0e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 15 Sep 2023 10:32:25 +0300 Subject: [PATCH 074/209] References to old and new blocks were mixed in xlog_heap_update handler (#5312) ## Problem See https://neondb.slack.com/archives/C05L7D1JAUS/p1694614585955029 https://www.notion.so/neondatabase/Duplicate-key-issue-651627ce843c45188fbdcb2d30fd2178 ## Summary of changes Swap old/new block references ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? 
- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- pageserver/src/walingest.rs | 16 +++++----- test_runner/regress/test_vm_bits.py | 48 +++++++++++++++++++++++------ 2 files changed, 47 insertions(+), 17 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9192af0ee8..d5ef1f2a24 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -470,14 +470,14 @@ impl<'a> WalIngest<'a> { // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. - new_heap_blkno = Some(decoded.blocks[1].blkno); + new_heap_blkno = Some(decoded.blocks[0].blkno); } } } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { @@ -526,14 +526,14 @@ impl<'a> WalIngest<'a> { // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. - new_heap_blkno = Some(decoded.blocks[1].blkno); + new_heap_blkno = Some(decoded.blocks[0].blkno); } } } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { @@ -582,14 +582,14 @@ impl<'a> WalIngest<'a> { // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. - new_heap_blkno = Some(decoded.blocks[1].blkno); + new_heap_blkno = Some(decoded.blocks[0].blkno); } } } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { @@ -745,14 +745,14 @@ impl<'a> WalIngest<'a> { // we can't validate the remaining number of bytes without parsing // the tuple data. if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); } if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a // non-HOT update where the new tuple goes to different page than // the old one. 
Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is // set. - new_heap_blkno = Some(decoded.blocks[1].blkno); + new_heap_blkno = Some(decoded.blocks[0].blkno); } } pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py index d8034b31b0..8e20a44407 100644 --- a/test_runner/regress/test_vm_bits.py +++ b/test_runner/regress/test_vm_bits.py @@ -19,18 +19,40 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): # Install extension containing function needed for test cur.execute("CREATE EXTENSION neon_test_utils") - # Create a test table and freeze it to set the VM bit. + # Create a test table for a few different scenarios and freeze it to set the VM bits. cur.execute("CREATE TABLE vmtest_delete (id integer PRIMARY KEY)") cur.execute("INSERT INTO vmtest_delete VALUES (1)") cur.execute("VACUUM FREEZE vmtest_delete") - cur.execute("CREATE TABLE vmtest_update (id integer PRIMARY KEY)") - cur.execute("INSERT INTO vmtest_update SELECT g FROM generate_series(1, 1000) g") - cur.execute("VACUUM FREEZE vmtest_update") + cur.execute("CREATE TABLE vmtest_hot_update (id integer PRIMARY KEY, filler text)") + cur.execute("INSERT INTO vmtest_hot_update VALUES (1, 'x')") + cur.execute("VACUUM FREEZE vmtest_hot_update") + + cur.execute("CREATE TABLE vmtest_cold_update (id integer PRIMARY KEY)") + cur.execute("INSERT INTO vmtest_cold_update SELECT g FROM generate_series(1, 1000) g") + cur.execute("VACUUM FREEZE vmtest_cold_update") + + cur.execute( + "CREATE TABLE vmtest_cold_update2 (id integer PRIMARY KEY, filler text) WITH (fillfactor=100)" + ) + cur.execute("INSERT INTO vmtest_cold_update2 SELECT g, '' FROM generate_series(1, 1000) g") + cur.execute("VACUUM FREEZE vmtest_cold_update2") # DELETE and UPDATE the rows. cur.execute("DELETE FROM vmtest_delete WHERE id = 1") - cur.execute("UPDATE vmtest_update SET id = 5000 WHERE id = 1") + cur.execute("UPDATE vmtest_hot_update SET filler='x' WHERE id = 1") + cur.execute("UPDATE vmtest_cold_update SET id = 5000 WHERE id = 1") + + # Clear the VM bit on the last page with an INSERT. Then clear the VM bit on + # the page where row 1 is (block 0), by doing an UPDATE. The UPDATE is a + # cold update, and the new tuple goes to the last page, which already had + # its VM bit cleared. The point is that the UPDATE *only* clears the VM bit + # on the page containing the old tuple. We had a bug where we got the old + # and new pages mixed up, and that only shows up when one of the bits is + # cleared, but not the other one. 
+ cur.execute("INSERT INTO vmtest_cold_update2 VALUES (9999, 'x')") + # Clears the VM bit on the old page + cur.execute("UPDATE vmtest_cold_update2 SET id = 5000, filler=repeat('x', 200) WHERE id = 1") # Branch at this point, to test that later fork_at_current_lsn(env, endpoint, "test_vm_bit_clear_new", "test_vm_bit_clear") @@ -50,9 +72,13 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): """ ) - cur.execute("SELECT * FROM vmtest_delete WHERE id = 1") + cur.execute("SELECT id FROM vmtest_delete WHERE id = 1") assert cur.fetchall() == [] - cur.execute("SELECT * FROM vmtest_update WHERE id = 1") + cur.execute("SELECT id FROM vmtest_hot_update WHERE id = 1") + assert cur.fetchall() == [(1,)] + cur.execute("SELECT id FROM vmtest_cold_update WHERE id = 1") + assert cur.fetchall() == [] + cur.execute("SELECT id FROM vmtest_cold_update2 WHERE id = 1") assert cur.fetchall() == [] cur.close() @@ -77,7 +103,11 @@ def test_vm_bit_clear(neon_simple_env: NeonEnv): """ ) - cur_new.execute("SELECT * FROM vmtest_delete WHERE id = 1") + cur_new.execute("SELECT id FROM vmtest_delete WHERE id = 1") assert cur_new.fetchall() == [] - cur_new.execute("SELECT * FROM vmtest_update WHERE id = 1") + cur_new.execute("SELECT id FROM vmtest_hot_update WHERE id = 1") + assert cur_new.fetchall() == [(1,)] + cur_new.execute("SELECT id FROM vmtest_cold_update WHERE id = 1") + assert cur_new.fetchall() == [] + cur_new.execute("SELECT id FROM vmtest_cold_update2 WHERE id = 1") assert cur_new.fetchall() == [] From 46857e82828f570750cd2ac52eedf7260500a192 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 15 Sep 2023 15:27:00 +0100 Subject: [PATCH 075/209] Postgres 14/15: Use previous extensions versions --- Dockerfile.compute-node | 170 +++++++++++++++++++++++++++++++++++----- 1 file changed, 150 insertions(+), 20 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index d724185d5d..f7a15312ce 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -74,8 +74,21 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH "/usr/local/pgsql/bin:$PATH" -RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ - echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export POSTGIS_VERSION=3.3.2 \ + export POSTGIS_CHECKSUM=9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 \ + ;; \ + "v16") \ + export POSTGIS_VERSION=3.3.3 \ + export POSTGIS_CHECKSUM=74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ + echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . 
&& \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ @@ -124,8 +137,21 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ apt install -y ninja-build python3-dev libncurses5 binutils clang -RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.8.tar.gz -O plv8.tar.gz && \ - echo "92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 plv8.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PLV8_VERSION=3.1.5 \ + export PLV8_CHECKSUM=1e108d5df639e4c189e1c5bdfa2432a521c126ca89e7e5a969d46899ca7bf106 \ + ;; \ + "v16") \ + export PLV8_VERSION=3.1.8 \ + export PLV8_CHECKSUM=92b10c7db39afdae97ff748c9ec54713826af222c459084ad002571b79eb3f49 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/plv8/plv8/archive/refs/tags/v${PLV8_VERSION}.tar.gz -O plv8.tar.gz && \ + echo "${PLV8_CHECKSUM} plv8.tar.gz" | sha256sum --check && \ mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -172,8 +198,21 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz cp -R /h3/usr / && \ rm -rf build -RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ - echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export H3PG_VERSION=4.1.2 \ + export H3PG_CHECKSUM=c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 \ + ;; \ + "v16") \ + export H3PG_VERSION=4.1.3 \ + export H3PG_CHECKSUM=5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/zachasme/h3-pg/archive/refs/tags/v${H3PG_VERSION}.tar.gz -O h3-pg.tar.gz && \ + echo "${H3PG_CHECKSUM} h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -243,8 +282,21 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 FROM build-deps AS hypopg-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ - echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export HYPOPG_VERSION=1.3.1 \ + export HYPOPG_CHECKSUM=e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e \ + ;; \ + "v16") \ + export HYPOPG_VERSION=1.4.0 \ + export HYPOPG_CHECKSUM=0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/HypoPG/hypopg/archive/refs/tags/${HYPOPG_VERSION}.tar.gz -O hypopg.tar.gz && \ + echo "${HYPOPG_CHECKSUM} hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -307,8 +359,21 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta FROM build-deps AS ip4r-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ - echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export IP4R_VERSION=2.4.1 \ + export IP4R_CHECKSUM=78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 \ + ;; \ + "v16") \ + export IP4R_VERSION=2.4.2 \ + export IP4R_CHECKSUM=0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/${IP4R_VERSION}.tar.gz -O ip4r.tar.gz && \ + echo "${IP4R_CHECKSUM} ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -323,8 +388,21 @@ RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O i FROM build-deps AS prefix-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ - echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PREFIX_VERSION=1.2.9 \ + export PREFIX_CHECKSUM=38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e \ + ;; \ + "v16") \ + export PREFIX_VERSION=1.2.10 \ + export PREFIX_CHECKSUM=4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/dimitri/prefix/archive/refs/tags/v$P{PREFIX_VERSION}.tar.gz -O prefix.tar.gz && \ + echo "${PREFIX_CHECKSUM} prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -339,8 +417,21 @@ RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O p FROM build-deps AS hll-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ - echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export HLL_VERSION=2.17 \ + export HLL_CHECKSUM=9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 \ + ;; \ + "v16") \ + export HLL_VERSION=2.18 \ + export HLL_CHECKSUM=e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v${HLL_VERSION}.tar.gz -O hll.tar.gz && \ + echo "${HLL_CHECKSUM} hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -355,8 +446,21 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar FROM build-deps AS plpgsql-check-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \ - echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PLPGSQL_CHECK_VERSION=2.3.2 \ + export PLPGSQL_CHECK_CHECKSUM=9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b \ + ;; \ + "v16") \ + export PLPGSQL_CHECK_VERSION=2.4.0 \ + export PLPGSQL_CHECK_CHECKSUM=9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v${PLPGSQL_CHECK_VERSION}.tar.gz -O plpgsql_check.tar.gz && \ + echo "${PLPGSQL_CHECK_CHECKSUM} plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -465,8 +569,21 @@ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ - echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export PG_CRON_VERSION=1.5.2 \ + export PG_CRON_CHECKSUM=6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 \ + ;; \ + "v16") \ + export PG_CRON_VERSION=1.6.0 \ + export PG_CRON_CHECKSUM=383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/citusdata/pg_cron/archive/refs/tags/v${PG_CRON_VERSION}.tar.gz -O pg_cron.tar.gz && \ + echo "${PG_CRON_CHECKSUM} pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -492,8 +609,21 @@ RUN apt-get update && \ libfreetype6-dev ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ - echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export RDKIT_VERSION=2023_03_1 \ + export RDKIT_CHECKSUM=db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c \ + ;; \ + "v16") \ + export RDKIT_VERSION=2023_03_3 \ + export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \ + ;; \ + *) \ + echo "Export the valid PG_VERSION variable" && exit 1 \ + ;; \ + esac && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ + echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . 
&& \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ From 23ee4f3050539125e0b7a731f3f6de4637a1a1a9 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 15 Sep 2023 15:45:23 +0100 Subject: [PATCH 076/209] Revert plv8 only --- Dockerfile.compute-node | 153 +++++----------------------------------- 1 file changed, 18 insertions(+), 135 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index f7a15312ce..892c5e93f3 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -74,21 +74,8 @@ RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar ENV PATH "/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export POSTGIS_VERSION=3.3.2 \ - export POSTGIS_CHECKSUM=9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 \ - ;; \ - "v16") \ - export POSTGIS_VERSION=3.3.3 \ - export POSTGIS_CHECKSUM=74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://download.osgeo.org/postgis/source/postgis-${POSTGIS_VERSION}.tar.gz -O postgis.tar.gz && \ - echo "${POSTGIS_CHECKSUM} postgis.tar.gz" | sha256sum --check && \ +RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.3.tar.gz -O postgis.tar.gz && \ + echo "74eb356e3f85f14233791013360881b6748f78081cc688ff9d6f0f673a762d13 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \ find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ @@ -198,21 +185,8 @@ RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz cp -R /h3/usr / && \ rm -rf build -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export H3PG_VERSION=4.1.2 \ - export H3PG_CHECKSUM=c135aa45999b2ad1326d2537c1cadef96d52660838e4ca371706c08fdea1a956 \ - ;; \ - "v16") \ - export H3PG_VERSION=4.1.3 \ - export H3PG_CHECKSUM=5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/zachasme/h3-pg/archive/refs/tags/v${H3PG_VERSION}.tar.gz -O h3-pg.tar.gz && \ - echo "${H3PG_CHECKSUM} h3-pg.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.3.tar.gz -O h3-pg.tar.gz && \ + echo "5c17f09a820859ffe949f847bebf1be98511fb8f1bd86f94932512c00479e324 h3-pg.tar.gz" | sha256sum --check && \ mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . 
&& \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make -j $(getconf _NPROCESSORS_ONLN) && \ @@ -282,21 +256,8 @@ RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b214 FROM build-deps AS hypopg-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export HYPOPG_VERSION=1.3.1 \ - export HYPOPG_CHECKSUM=e7f01ee0259dc1713f318a108f987663d60f3041948c2ada57a94b469565ca8e \ - ;; \ - "v16") \ - export HYPOPG_VERSION=1.4.0 \ - export HYPOPG_CHECKSUM=0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/HypoPG/hypopg/archive/refs/tags/${HYPOPG_VERSION}.tar.gz -O hypopg.tar.gz && \ - echo "${HYPOPG_CHECKSUM} hypopg.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ + echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xvzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -359,21 +320,8 @@ RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgta FROM build-deps AS ip4r-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export IP4R_VERSION=2.4.1 \ - export IP4R_CHECKSUM=78b9f0c1ae45c22182768fe892a32d533c82281035e10914111400bf6301c726 \ - ;; \ - "v16") \ - export IP4R_VERSION=2.4.2 \ - export IP4R_CHECKSUM=0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/${IP4R_VERSION}.tar.gz -O ip4r.tar.gz && \ - echo "${IP4R_CHECKSUM} ip4r.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/RhodiumToad/ip4r/archive/refs/tags/2.4.2.tar.gz -O ip4r.tar.gz && \ + echo "0f7b1f159974f49a47842a8ab6751aecca1ed1142b6d5e38d81b064b2ead1b4b ip4r.tar.gz" | sha256sum --check && \ mkdir ip4r-src && cd ip4r-src && tar xvzf ../ip4r.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -388,21 +336,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS prefix-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export PREFIX_VERSION=1.2.9 \ - export PREFIX_CHECKSUM=38d30a08d0241a8bbb8e1eb8f0152b385051665a8e621c8899e7c5068f8b511e \ - ;; \ - "v16") \ - export PREFIX_VERSION=1.2.10 \ - export PREFIX_CHECKSUM=4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/dimitri/prefix/archive/refs/tags/v$P{PREFIX_VERSION}.tar.gz -O prefix.tar.gz && \ - echo "${PREFIX_CHECKSUM} prefix.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/dimitri/prefix/archive/refs/tags/v1.2.10.tar.gz -O prefix.tar.gz && \ + echo "4342f251432a5f6fb05b8597139d3ccde8dcf87e8ca1498e7ee931ca057a8575 prefix.tar.gz" | sha256sum --check && \ mkdir prefix-src && cd prefix-src && tar xvzf ../prefix.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -417,21 +352,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS hll-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export HLL_VERSION=2.17 \ - export HLL_CHECKSUM=9a18288e884f197196b0d29b9f178ba595b0dfc21fbf7a8699380e77fa04c1e9 \ - ;; \ - "v16") \ - export HLL_VERSION=2.18 \ - export HLL_CHECKSUM=e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v${HLL_VERSION}.tar.gz -O hll.tar.gz && \ - echo "${HLL_CHECKSUM} hll.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar.gz -O hll.tar.gz && \ + echo "e2f55a6f4c4ab95ee4f1b4a2b73280258c5136b161fe9d059559556079694f0e hll.tar.gz" | sha256sum --check && \ mkdir hll-src && cd hll-src && tar xvzf ../hll.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -446,21 +368,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS plpgsql-check-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export PLPGSQL_CHECK_VERSION=2.3.2 \ - export PLPGSQL_CHECK_CHECKSUM=9d81167c4bbeb74eebf7d60147b21961506161addc2aee537f95ad8efeae427b \ - ;; \ - "v16") \ - export PLPGSQL_CHECK_VERSION=2.4.0 \ - export PLPGSQL_CHECK_CHECKSUM=9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v${PLPGSQL_CHECK_VERSION}.tar.gz -O plpgsql_check.tar.gz && \ - echo "${PLPGSQL_CHECK_CHECKSUM} plpgsql_check.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \ + echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -569,21 +478,8 @@ FROM build-deps AS pg-cron-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export PG_CRON_VERSION=1.5.2 \ - export PG_CRON_CHECKSUM=6f7f0980c03f1e2a6a747060e67bf4a303ca2a50e941e2c19daeed2b44dec744 \ - ;; \ - "v16") \ - export PG_CRON_VERSION=1.6.0 \ - export PG_CRON_CHECKSUM=383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/citusdata/pg_cron/archive/refs/tags/v${PG_CRON_VERSION}.tar.gz -O pg_cron.tar.gz && \ - echo "${PG_CRON_CHECKSUM} pg_cron.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ + echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xvzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -609,21 +505,8 @@ RUN apt-get update && \ libfreetype6-dev ENV PATH "/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN case "${PG_VERSION}" in \ - "v14" | "v15") \ - export RDKIT_VERSION=2023_03_1 \ - export RDKIT_CHECKSUM=db346afbd0ba52c843926a2a62f8a38c7b774ffab37eaf382d789a824f21996c \ - ;; \ - "v16") \ - export RDKIT_VERSION=2023_03_3 \ - export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \ - ;; \ - *) \ - echo "Export the valid PG_VERSION variable" && exit 1 \ - ;; \ - esac && \ - wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ - echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ + echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xvzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ From ae0634b7beea1777d734e66089b8eb45cd656861 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 28 Sep 2023 00:52:39 -0700 Subject: [PATCH 077/209] Bump vm-builder v0.17.11 -> v0.17.12 (#5407) Only relevant change is neondatabase/autoscaling#534 - refer there for more details. --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 7271a8d29f..65a2101dc6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -834,7 +834,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.17.11 + VM_BUILDER_VERSION: v0.17.12 steps: - name: Checkout From 72aa6b9fdd5b0ccbfe6aec8add2d7a951d4cc5a3 Mon Sep 17 00:00:00 2001 From: MMeent Date: Wed, 27 Sep 2023 13:48:30 +0200 Subject: [PATCH 078/209] Fix neon_zeroextend's WAL logging (#5387) When you log more than a few blocks, you need to reserve the space in advance. We didn't do that, so we got errors. Now we do that, and shouldn't get errors. --- pgxn/neon/pagestore_smgr.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 919bca03e9..2e4364cbfa 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1790,6 +1790,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum, if (!XLogInsertAllowed()) return; + /* ensure we have enough xlog buffers to log max-sized records */ + XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0); + + /* + * Iterate over all the pages. They are collected into batches of + * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each + * batch. + */ while (remblocks > 0) { int count = Min(remblocks, XLR_MAX_BLOCK_ID); From 7222777784eb84efc13093783d9aabbe1260b5ef Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 3 Oct 2023 18:42:39 +0100 Subject: [PATCH 079/209] Update checksums for pg_jsonschema & pg_graphql (#5455) ## Problem Folks have re-taged releases for `pg_jsonschema` and `pg_graphql` (to increase timeouts on their CI), for us, these are a noop changes, but unfortunately, this will cause our builds to fail due to checksums mismatch (this might not strike right away because of the build cache). 
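The new digests below can be verified by re-downloading the tarballs from the same
URLs used in Dockerfile.compute-node and running sha256sum over them. The upstream
re-tag commits are: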
- https://github.com/supabase/pg_jsonschema/commit/8ba7c7be9d3f12c0cf30c7105db303ce2aaf12c2 - https://github.com/supabase/pg_graphql/commit/aa7509370a4d34a26d48126ac24bc937a009c115 ## Summary of changes - `pg_jsonschema` update checksum - `pg_graphql` update checksum --- Dockerfile.compute-node | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index e53ec47688..7e34b66d68 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -651,7 +651,7 @@ FROM rust-extensions-build AS pg-jsonschema-pg-build ARG PG_VERSION RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \ - echo "b1bd95009c8809bd6cda9a37777f8b7df425ff1a34976c1e7a4b31cf838ace66 pg_jsonschema.tar.gz" | sha256sum --check && \ + echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ @@ -668,7 +668,7 @@ FROM rust-extensions-build AS pg-graphql-pg-build ARG PG_VERSION RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \ - echo "ea85d45f8af1d2382e2af847f88102f930782c00e6c612308e6f08f27309d5f7 pg_graphql.tar.gz" | sha256sum --check && \ + echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ From a6b2f4e54ef97333c35834788bf555fc10e3f4bb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 17 Oct 2023 11:29:48 +0200 Subject: [PATCH 080/209] limit imitate accesses concurrency, using same semaphore as compactions (#5578) Before this PR, when we restarted pageserver, we'd see a rush of `$number_of_tenants` concurrent eviction tasks starting to do imitate accesses building up in the period of `[init_order allows activations, $random_access_delay + EvictionPolicyLayerAccessThreshold::period]`. We simply cannot handle that degree of concurrent IO. We already solved the problem for compactions by adding a semaphore. So, this PR shares that semaphore for use by evictions. Part of https://github.com/neondatabase/neon/issues/5479 Which is again part of https://github.com/neondatabase/neon/issues/4743 Risks / Changes In System Behavior ================================== * we don't do evictions as timely as we currently do * we log a bunch of warnings about eviction taking too long * imitate accesses and compactions compete for the same concurrency limit, so, they'll slow each other down through this shares semaphore Changes ======= - Move the `CONCURRENT_COMPACTIONS` semaphore into `tasks.rs` - Rename it to `CONCURRENT_BACKGROUND_TASKS` - Use it also for the eviction imitate accesses: - Imitate acceses are both per-TIMELINE and per-TENANT - The per-TENANT is done through coalescing all the per-TIMELINE tasks via a tokio mutex `eviction_task_tenant_state`. 
- We acquire the CONCURRENT_BACKGROUND_TASKS permit early, at the beginning of the eviction iteration, much before the imitate acesses start (and they may not even start at all in the given iteration, as they happen only every $threshold). - Acquiring early is **sub-optimal** because when the per-timline tasks coalesce on the `eviction_task_tenant_state` mutex, they are already holding a CONCURRENT_BACKGROUND_TASKS permit. - It's also unfair because tenants with many timelines win the CONCURRENT_BACKGROUND_TASKS more often. - I don't think there's another way though, without refactoring more of the imitate accesses logic, e.g, making it all per-tenant. - Add metrics for queue depth behind the semaphore. I found these very useful to understand what work is queued in the system. - The metrics are tagged by the new `BackgroundLoopKind`. - On a green slate, I would have used `TaskKind`, but we already had pre-existing labels whose names didn't map exactly to task kind. Also the task kind is kind of a lower-level detail, so, I think it's fine to have a separate enum to identify background work kinds. Future Work =========== I guess I could move the eviction tasks from a ticker to "sleep for $period". The benefit would be that the semaphore automatically "smears" the eviction task scheduling over time, so, we only have the rush on restart but a smeared-out rush afterward. The downside is that this perverts the meaning of "$period", as we'd actually not run the eviction at a fixed period. It also means the the "took to long" warning & metric becomes meaningless. Then again, that is already the case for the compaction and gc tasks, which do sleep for `$period` instead of using a ticker. (cherry picked from commit 9256788273d5661ced0b2661a8751e2aa86fbb59) --- pageserver/src/consumption_metrics.rs | 10 ++- pageserver/src/metrics.rs | 20 +++++ pageserver/src/tenant/tasks.rs | 81 +++++++++++++++++-- pageserver/src/tenant/timeline.rs | 39 +++------ .../src/tenant/timeline/eviction_task.rs | 18 ++++- 5 files changed, 131 insertions(+), 37 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 72a2099d92..13f7977946 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -2,6 +2,7 @@ //! and push them to a HTTP endpoint. use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; +use crate::tenant::tasks::BackgroundLoopKind; use crate::tenant::{mgr, LogicalSizeCalculationCause}; use camino::Utf8PathBuf; use consumption_metrics::EventType; @@ -143,7 +144,7 @@ pub async fn collect_metrics( crate::tenant::tasks::warn_when_period_overrun( tick_at.elapsed(), metric_collection_interval, - "consumption_metrics_collect_metrics", + BackgroundLoopKind::ConsumptionMetricsCollectMetrics, ); } } @@ -268,6 +269,11 @@ async fn calculate_synthetic_size_worker( } if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await { + // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks? + // We can put in some prioritization for consumption metrics. + // Same for the loop that fetches computed metrics. + // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, + // which turns out is really handy to understand the system. 
if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await { error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); } @@ -277,7 +283,7 @@ async fn calculate_synthetic_size_worker( crate::tenant::tasks::warn_when_period_overrun( tick_at.elapsed(), synthetic_size_calculation_interval, - "consumption_metrics_synthetic_size_worker", + BackgroundLoopKind::ConsumptionMetricsSyntheticSizeWorker, ); } } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c154b4a4ca..eea3de0583 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1067,6 +1067,26 @@ pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); +pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT: Lazy = + Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_semaphore_wait_start_count", + "Counter for background loop concurrency-limiting semaphore acquire calls started", + &["task"], + ) + .unwrap() + }); + +pub(crate) static BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT: Lazy = + Lazy::new(|| { + register_int_counter_vec!( + "pageserver_background_loop_semaphore_wait_finish_count", + "Counter for background loop concurrency-limiting semaphore acquire calls finished", + &["task"], + ) + .unwrap() + }); + pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_background_loop_period_overrun_count", diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index df3ffd08d3..00c8ced68a 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -14,6 +14,73 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::completion; +static CONCURRENT_BACKGROUND_TASKS: once_cell::sync::Lazy = + once_cell::sync::Lazy::new(|| { + let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; + let permits = usize::max( + 1, + // while a lot of the work is done on spawn_blocking, we still do + // repartitioning in the async context. this should give leave us some workers + // unblocked to be blocked on other work, hopefully easing any outside visible + // effects of restarts. + // + // 6/8 is a guess; previously we ran with unlimited 8 and more from + // spawn_blocking. + (total_threads * 3).checked_div(4).unwrap_or(0), + ); + assert_ne!(permits, 0, "we will not be adding in permits later"); + assert!( + permits < total_threads, + "need threads avail for shorter work" + ); + tokio::sync::Semaphore::new(permits) + }); + +#[derive(Debug, PartialEq, Eq, Clone, Copy, strum_macros::IntoStaticStr)] +#[strum(serialize_all = "snake_case")] +pub(crate) enum BackgroundLoopKind { + Compaction, + Gc, + Eviction, + ConsumptionMetricsCollectMetrics, + ConsumptionMetricsSyntheticSizeWorker, +} + +impl BackgroundLoopKind { + fn as_static_str(&self) -> &'static str { + let s: &'static str = self.into(); + s + } +} + +pub(crate) enum RateLimitError { + Cancelled, +} + +pub(crate) async fn concurrent_background_tasks_rate_limit( + loop_kind: BackgroundLoopKind, + _ctx: &RequestContext, + cancel: &CancellationToken, +) -> Result { + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT + .with_label_values(&[loop_kind.as_static_str()]) + .inc(); + scopeguard::defer!( + crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc(); + ); + tokio::select! 
{ + permit = CONCURRENT_BACKGROUND_TASKS.acquire() => { + match permit { + Ok(permit) => Ok(permit), + Err(_closed) => unreachable!("we never close the semaphore"), + } + }, + _ = cancel.cancelled() => { + Err(RateLimitError::Cancelled) + } + } +} + /// Start per tenant background loops: compaction and gc. pub fn start_background_loops( tenant: &Arc, @@ -116,7 +183,7 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, "compaction"); + warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -184,7 +251,7 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { } }; - warn_when_period_overrun(started_at.elapsed(), period, "gc"); + warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) @@ -258,7 +325,11 @@ pub(crate) async fn random_init_delay( } /// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric. -pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) { +pub(crate) fn warn_when_period_overrun( + elapsed: Duration, + period: Duration, + task: BackgroundLoopKind, +) { // Duration::ZERO will happen because it's the "disable [bgtask]" value. if elapsed >= period && period != Duration::ZERO { // humantime does no significant digits clamping whereas Duration's debug is a bit more @@ -267,11 +338,11 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task warn!( ?elapsed, period = %humantime::format_duration(period), - task, + ?task, "task iteration took longer than the configured period" ); crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT - .with_label_values(&[task, &format!("{}", period.as_secs())]) + .with_label_values(&[task.as_static_str(), &format!("{}", period.as_secs())]) .inc(); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 537f776176..a6f92646a9 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -44,6 +44,7 @@ use crate::tenant::storage_layer::delta_layer::DeltaEntry; use crate::tenant::storage_layer::{ DeltaLayerWriter, ImageLayerWriter, InMemoryLayer, LayerAccessStats, LayerFileName, RemoteLayer, }; +use crate::tenant::tasks::{BackgroundLoopKind, RateLimitError}; use crate::tenant::timeline::logical_size::CurrentLogicalSize; use crate::tenant::{ layer_map::{LayerMap, SearchResult}, @@ -684,37 +685,17 @@ impl Timeline { ) -> anyhow::Result<()> { const ROUNDS: usize = 2; - static CONCURRENT_COMPACTIONS: once_cell::sync::Lazy = - once_cell::sync::Lazy::new(|| { - let total_threads = *task_mgr::BACKGROUND_RUNTIME_WORKER_THREADS; - let permits = usize::max( - 1, - // while a lot of the work is done on spawn_blocking, we still do - // repartitioning in the async context. this should give leave us some workers - // unblocked to be blocked on other work, hopefully easing any outside visible - // effects of restarts. - // - // 6/8 is a guess; previously we ran with unlimited 8 and more from - // spawn_blocking. 
- (total_threads * 3).checked_div(4).unwrap_or(0), - ); - assert_ne!(permits, 0, "we will not be adding in permits later"); - assert!( - permits < total_threads, - "need threads avail for shorter work" - ); - tokio::sync::Semaphore::new(permits) - }); - // this wait probably never needs any "long time spent" logging, because we already nag if // compaction task goes over it's period (20s) which is quite often in production. - let _permit = tokio::select! { - permit = CONCURRENT_COMPACTIONS.acquire() => { - permit - }, - _ = cancel.cancelled() => { - return Ok(()); - } + let _permit = match super::tasks::concurrent_background_tasks_rate_limit( + BackgroundLoopKind::Compaction, + ctx, + cancel, + ) + .await + { + Ok(permit) => permit, + Err(RateLimitError::Cancelled) => return Ok(()), }; let last_record_lsn = self.get_last_record_lsn(); diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 9bf31d85d4..38da993deb 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -30,6 +30,7 @@ use crate::{ tenant::{ config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold}, storage_layer::PersistentLayer, + tasks::{BackgroundLoopKind, RateLimitError}, timeline::EvictionError, LogicalSizeCalculationCause, Tenant, }, @@ -129,7 +130,11 @@ impl Timeline { ControlFlow::Continue(()) => (), } let elapsed = start.elapsed(); - crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction"); + crate::tenant::tasks::warn_when_period_overrun( + elapsed, + p.period, + BackgroundLoopKind::Eviction, + ); crate::metrics::EVICTION_ITERATION_DURATION .get_metric_with_label_values(&[ &format!("{}", p.period.as_secs()), @@ -150,6 +155,17 @@ impl Timeline { ) -> ControlFlow<()> { let now = SystemTime::now(); + let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit( + BackgroundLoopKind::Eviction, + ctx, + cancel, + ) + .await + { + Ok(permit) => permit, + Err(RateLimitError::Cancelled) => return ControlFlow::Break(()), + }; + // If we evict layers but keep cached values derived from those layers, then // we face a storm of on-demand downloads after pageserver restart. // The reason is that the restart empties the caches, and so, the values From 4d13bae449284088c496137b1af7501966c8a84f Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Tue, 17 Oct 2023 15:30:40 -0700 Subject: [PATCH 081/209] vm-monitor: Switch from memory.high to polling memory.stat (#5524) tl;dr it's really hard to avoid throttling from memory.high, and it counts tmpfs & page cache usage, so it's also hard to make sense of. In the interest of fixing things quickly with something that should be *good enough*, this PR switches to instead periodically fetch memory statistics from the cgroup's memory.stat and use that data to determine if and when we should upscale. This PR fixes #5444, which has a lot more detail on the difficulties we've hit with memory.high. This PR also supersedes #5488. --- libs/vm_monitor/README.md | 4 +- libs/vm_monitor/src/cgroup.rs | 786 ++++++++---------------------- libs/vm_monitor/src/dispatcher.rs | 29 +- libs/vm_monitor/src/runner.rs | 285 ++++++----- 4 files changed, 366 insertions(+), 738 deletions(-) diff --git a/libs/vm_monitor/README.md b/libs/vm_monitor/README.md index 53cdecd9f3..fdd943077d 100644 --- a/libs/vm_monitor/README.md +++ b/libs/vm_monitor/README.md @@ -27,8 +27,8 @@ and old one if it exists. 
* the filecache: a struct that allows communication with the Postgres file cache. On startup, we connect to the filecache and hold on to the connection for the entire monitor lifetime. -* the cgroup watcher: the `CgroupWatcher` manages the `neon-postgres` cgroup by -listening for `memory.high` events and setting its `memory.{high,max}` values. +* the cgroup watcher: the `CgroupWatcher` polls the `neon-postgres` cgroup's memory +usage and sends rolling aggregates to the runner. * the runner: the runner marries the filecache and cgroup watcher together, communicating with the agent throught the `Dispatcher`, and then calling filecache and cgroup watcher functions as needed to upscale and downscale diff --git a/libs/vm_monitor/src/cgroup.rs b/libs/vm_monitor/src/cgroup.rs index 15e972505e..7160a42df2 100644 --- a/libs/vm_monitor/src/cgroup.rs +++ b/libs/vm_monitor/src/cgroup.rs @@ -1,161 +1,38 @@ -use std::{ - fmt::{Debug, Display}, - fs, - pin::pin, - sync::atomic::{AtomicU64, Ordering}, -}; +use std::fmt::{self, Debug, Formatter}; +use std::time::{Duration, Instant}; -use anyhow::{anyhow, bail, Context}; +use anyhow::{anyhow, Context}; use cgroups_rs::{ - freezer::FreezerController, - hierarchies::{self, is_cgroup2_unified_mode, UNIFIED_MOUNTPOINT}, + hierarchies::{self, is_cgroup2_unified_mode}, memory::MemController, - MaxValue, - Subsystem::{Freezer, Mem}, + Subsystem, }; -use inotify::{EventStream, Inotify, WatchMask}; -use tokio::sync::mpsc::{self, error::TryRecvError}; -use tokio::time::{Duration, Instant}; -use tokio_stream::{Stream, StreamExt}; +use tokio::sync::watch; use tracing::{info, warn}; -use crate::protocol::Resources; -use crate::MiB; - -/// Monotonically increasing counter of the number of memory.high events -/// the cgroup has experienced. -/// -/// We use this to determine if a modification to the `memory.events` file actually -/// changed the `high` field. If not, we don't care about the change. When we -/// read the file, we check the `high` field in the file against `MEMORY_EVENT_COUNT` -/// to see if it changed since last time. -pub static MEMORY_EVENT_COUNT: AtomicU64 = AtomicU64::new(0); - -/// Monotonically increasing counter that gives each cgroup event a unique id. -/// -/// This allows us to answer questions like "did this upscale arrive before this -/// memory.high?". This static is also used by the `Sequenced` type to "tag" values -/// with a sequence number. As such, prefer to used the `Sequenced` type rather -/// than this static directly. -static EVENT_SEQUENCE_NUMBER: AtomicU64 = AtomicU64::new(0); - -/// A memory event type reported in memory.events. -#[derive(Debug, Eq, PartialEq, Copy, Clone)] -pub enum MemoryEvent { - Low, - High, - Max, - Oom, - OomKill, - OomGroupKill, -} - -impl MemoryEvent { - fn as_str(&self) -> &str { - match self { - MemoryEvent::Low => "low", - MemoryEvent::High => "high", - MemoryEvent::Max => "max", - MemoryEvent::Oom => "oom", - MemoryEvent::OomKill => "oom_kill", - MemoryEvent::OomGroupKill => "oom_group_kill", - } - } -} - -impl Display for MemoryEvent { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(self.as_str()) - } -} - /// Configuration for a `CgroupWatcher` #[derive(Debug, Clone)] pub struct Config { - // The target difference between the total memory reserved for the cgroup - // and the value of the cgroup's memory.high. 
- // - // In other words, memory.high + oom_buffer_bytes will equal the total memory that the cgroup may - // use (equal to system memory, minus whatever's taken out for the file cache). - oom_buffer_bytes: u64, + /// Interval at which we should be fetching memory statistics + memory_poll_interval: Duration, - // The amount of memory, in bytes, below a proposed new value for - // memory.high that the cgroup's memory usage must be for us to downscale - // - // In other words, we can downscale only when: - // - // memory.current + memory_high_buffer_bytes < (proposed) memory.high - // - // TODO: there's some minor issues with this approach -- in particular, that we might have - // memory in use by the kernel's page cache that we're actually ok with getting rid of. - pub(crate) memory_high_buffer_bytes: u64, - - // The maximum duration, in milliseconds, that we're allowed to pause - // the cgroup for while waiting for the autoscaler-agent to upscale us - max_upscale_wait: Duration, - - // The required minimum time, in milliseconds, that we must wait before re-freezing - // the cgroup while waiting for the autoscaler-agent to upscale us. - do_not_freeze_more_often_than: Duration, - - // The amount of memory, in bytes, that we should periodically increase memory.high - // by while waiting for the autoscaler-agent to upscale us. - // - // This exists to avoid the excessive throttling that happens when a cgroup is above its - // memory.high for too long. See more here: - // https://github.com/neondatabase/autoscaling/issues/44#issuecomment-1522487217 - memory_high_increase_by_bytes: u64, - - // The period, in milliseconds, at which we should repeatedly increase the value - // of the cgroup's memory.high while we're waiting on upscaling and memory.high - // is still being hit. - // - // Technically speaking, this actually serves as a rate limit to moderate responding to - // memory.high events, but these are roughly equivalent if the process is still allocating - // memory. - memory_high_increase_every: Duration, -} - -impl Config { - /// Calculate the new value for the cgroups memory.high based on system memory - pub fn calculate_memory_high_value(&self, total_system_mem: u64) -> u64 { - total_system_mem.saturating_sub(self.oom_buffer_bytes) - } + /// The number of samples used in constructing aggregated memory statistics + memory_history_len: usize, + /// The number of most recent samples that will be periodically logged. + /// + /// Each sample is logged exactly once. Increasing this value means that recent samples will be + /// logged less frequently, and vice versa. + /// + /// For simplicity, this value must be greater than or equal to `memory_history_len`. + memory_history_log_interval: usize, } impl Default for Config { fn default() -> Self { Self { - oom_buffer_bytes: 100 * MiB, - memory_high_buffer_bytes: 100 * MiB, - // while waiting for upscale, don't freeze for more than 20ms every 1s - max_upscale_wait: Duration::from_millis(20), - do_not_freeze_more_often_than: Duration::from_millis(1000), - // while waiting for upscale, increase memory.high by 10MiB every 25ms - memory_high_increase_by_bytes: 10 * MiB, - memory_high_increase_every: Duration::from_millis(25), - } - } -} - -/// Used to represent data that is associated with a certain point in time, such -/// as an upscale request or memory.high event. -/// -/// Internally, creating a `Sequenced` uses a static atomic counter to obtain -/// a unique sequence number. 
Sequence numbers are monotonically increasing, -/// allowing us to answer questions like "did this upscale happen after this -/// memory.high event?" by comparing the sequence numbers of the two events. -#[derive(Debug, Clone)] -pub struct Sequenced { - seqnum: u64, - data: T, -} - -impl Sequenced { - pub fn new(data: T) -> Self { - Self { - seqnum: EVENT_SEQUENCE_NUMBER.fetch_add(1, Ordering::AcqRel), - data, + memory_poll_interval: Duration::from_millis(100), + memory_history_len: 5, // use 500ms of history for decision-making + memory_history_log_interval: 20, // but only log every ~2s (otherwise it's spammy) } } } @@ -170,74 +47,14 @@ impl Sequenced { pub struct CgroupWatcher { pub config: Config, - /// The sequence number of the last upscale. - /// - /// If we receive a memory.high event that has a _lower_ sequence number than - /// `last_upscale_seqnum`, then we know it occured before the upscale, and we - /// can safely ignore it. - /// - /// Note: Like the `events` field, this doesn't _need_ interior mutability but we - /// use it anyways so that methods take `&self`, not `&mut self`. - last_upscale_seqnum: AtomicU64, - - /// A channel on which we send messages to request upscale from the dispatcher. - upscale_requester: mpsc::Sender<()>, - /// The actual cgroup we are watching and managing. cgroup: cgroups_rs::Cgroup, } -/// Read memory.events for the desired event type. -/// -/// `path` specifies the path to the desired `memory.events` file. -/// For more info, see the `memory.events` section of the [kernel docs] -/// -fn get_event_count(path: &str, event: MemoryEvent) -> anyhow::Result { - let contents = fs::read_to_string(path) - .with_context(|| format!("failed to read memory.events from {path}"))?; - - // Then contents of the file look like: - // low 42 - // high 101 - // ... - contents - .lines() - .filter_map(|s| s.split_once(' ')) - .find(|(e, _)| *e == event.as_str()) - .ok_or_else(|| anyhow!("failed to find entry for memory.{event} events in {path}")) - .and_then(|(_, count)| { - count - .parse::() - .with_context(|| format!("failed to parse memory.{event} as u64")) - }) -} - -/// Create an event stream that produces events whenever the file at the provided -/// path is modified. -fn create_file_watcher(path: &str) -> anyhow::Result> { - info!("creating file watcher for {path}"); - let inotify = Inotify::init().context("failed to initialize file watcher")?; - inotify - .watches() - .add(path, WatchMask::MODIFY) - .with_context(|| format!("failed to start watching {path}"))?; - inotify - // The inotify docs use [0u8; 1024] so we'll just copy them. We only need - // to store one event at a time - if the event gets written over, that's - // ok. We still see that there is an event. For more information, see: - // https://man7.org/linux/man-pages/man7/inotify.7.html - .into_event_stream([0u8; 1024]) - .context("failed to start inotify event stream") -} - impl CgroupWatcher { /// Create a new `CgroupWatcher`. #[tracing::instrument(skip_all, fields(%name))] - pub fn new( - name: String, - // A channel on which to send upscale requests - upscale_requester: mpsc::Sender<()>, - ) -> anyhow::Result<(Self, impl Stream>)> { + pub fn new(name: String) -> anyhow::Result { // TODO: clarify exactly why we need v2 // Make sure cgroups v2 (aka unified) are supported if !is_cgroup2_unified_mode() { @@ -245,410 +62,203 @@ impl CgroupWatcher { } let cgroup = cgroups_rs::Cgroup::load(hierarchies::auto(), &name); - // Start monitoring the cgroup for memory events. 
In general, for - // cgroups v2 (aka unified), metrics are reported in files like - // > `/sys/fs/cgroup/{name}/{metric}` - // We are looking for `memory.high` events, which are stored in the - // file `memory.events`. For more info, see the `memory.events` section - // of https://docs.kernel.org/admin-guide/cgroup-v2.html#memory-interface-files - let path = format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name); - let memory_events = create_file_watcher(&path) - .with_context(|| format!("failed to create event watcher for {path}"))? - // This would be nice with with .inspect_err followed by .ok - .filter_map(move |_| match get_event_count(&path, MemoryEvent::High) { - Ok(high) => Some(high), - Err(error) => { - // TODO: Might want to just panic here - warn!(?error, "failed to read high events count from {}", &path); - None - } - }) - // Only report the event if the memory.high count increased - .filter_map(|high| { - if MEMORY_EVENT_COUNT.fetch_max(high, Ordering::AcqRel) < high { - Some(high) - } else { - None - } - }) - .map(Sequenced::new); - - let initial_count = get_event_count( - &format!("{}/{}/memory.events", UNIFIED_MOUNTPOINT, &name), - MemoryEvent::High, - )?; - - info!(initial_count, "initial memory.high event count"); - - // Hard update `MEMORY_EVENT_COUNT` since there could have been processes - // running in the cgroup before that caused it to be non-zero. - MEMORY_EVENT_COUNT.fetch_max(initial_count, Ordering::AcqRel); - - Ok(( - Self { - cgroup, - upscale_requester, - last_upscale_seqnum: AtomicU64::new(0), - config: Default::default(), - }, - memory_events, - )) + Ok(Self { + cgroup, + config: Default::default(), + }) } /// The entrypoint for the `CgroupWatcher`. #[tracing::instrument(skip_all)] - pub async fn watch( + pub async fn watch( &self, - // These are ~dependency injected~ (fancy, I know) because this function - // should never return. - // -> therefore: when we tokio::spawn it, we don't await the JoinHandle. - // -> therefore: if we want to stick it in an Arc so many threads can access - // it, methods can never take mutable access. - // - note: we use the Arc strategy so that a) we can call this function - // right here and b) the runner can call the set/get_memory methods - // -> since calling recv() on a tokio::sync::mpsc::Receiver takes &mut self, - // we just pass them in here instead of holding them in fields, as that - // would require this method to take &mut self. - mut upscales: mpsc::Receiver>, - events: E, - ) -> anyhow::Result<()> - where - E: Stream>, - { - let mut wait_to_freeze = pin!(tokio::time::sleep(Duration::ZERO)); - let mut last_memory_high_increase_at: Option = None; - let mut events = pin!(events); - - // Are we waiting to be upscaled? Could be true if we request upscale due - // to a memory.high event and it does not arrive in time. - let mut waiting_on_upscale = false; - - loop { - tokio::select! { - upscale = upscales.recv() => { - let Sequenced { seqnum, data } = upscale - .context("failed to listen on upscale notification channel")?; - waiting_on_upscale = false; - last_memory_high_increase_at = None; - self.last_upscale_seqnum.store(seqnum, Ordering::Release); - info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale"); - } - event = events.next() => { - let Some(Sequenced { seqnum, .. 
}) = event else { - bail!("failed to listen for memory.high events") - }; - // The memory.high came before our last upscale, so we consider - // it resolved - if self.last_upscale_seqnum.fetch_max(seqnum, Ordering::AcqRel) > seqnum { - info!( - "received memory.high event, but it came before our last upscale -> ignoring it" - ); - continue; - } - - // The memory.high came after our latest upscale. We don't - // want to do anything yet, so peek the next event in hopes - // that it's an upscale. - if let Some(upscale_num) = self - .upscaled(&mut upscales) - .context("failed to check if we were upscaled")? - { - if upscale_num > seqnum { - info!( - "received memory.high event, but it came before our last upscale -> ignoring it" - ); - continue; - } - } - - // If it's been long enough since we last froze, freeze the - // cgroup and request upscale - if wait_to_freeze.is_elapsed() { - info!("received memory.high event -> requesting upscale"); - waiting_on_upscale = self - .handle_memory_high_event(&mut upscales) - .await - .context("failed to handle upscale")?; - wait_to_freeze - .as_mut() - .reset(Instant::now() + self.config.do_not_freeze_more_often_than); - continue; - } - - // Ok, we can't freeze, just request upscale - if !waiting_on_upscale { - info!("received memory.high event, but too soon to refreeze -> requesting upscale"); - - // Make check to make sure we haven't been upscaled in the - // meantine (can happen if the agent independently decides - // to upscale us again) - if self - .upscaled(&mut upscales) - .context("failed to check if we were upscaled")? - .is_some() - { - info!("no need to request upscaling because we got upscaled"); - continue; - } - self.upscale_requester - .send(()) - .await - .context("failed to request upscale")?; - waiting_on_upscale = true; - continue; - } - - // Shoot, we can't freeze or and we're still waiting on upscale, - // increase memory.high to reduce throttling - let can_increase_memory_high = match last_memory_high_increase_at { - None => true, - Some(t) => t.elapsed() > self.config.memory_high_increase_every, - }; - if can_increase_memory_high { - info!( - "received memory.high event, \ - but too soon to refreeze and already requested upscale \ - -> increasing memory.high" - ); - - // Make check to make sure we haven't been upscaled in the - // meantine (can happen if the agent independently decides - // to upscale us again) - if self - .upscaled(&mut upscales) - .context("failed to check if we were upscaled")? - .is_some() - { - info!("no need to increase memory.high because got upscaled"); - continue; - } - - // Request upscale anyways (the agent will handle deduplicating - // requests) - self.upscale_requester - .send(()) - .await - .context("failed to request upscale")?; - - let memory_high = - self.get_memory_high_bytes().context("failed to get memory.high")?; - let new_high = memory_high + self.config.memory_high_increase_by_bytes; - info!( - current_high_bytes = memory_high, - new_high_bytes = new_high, - "updating memory.high" - ); - self.set_memory_high_bytes(new_high) - .context("failed to set memory.high")?; - last_memory_high_increase_at = Some(Instant::now()); - continue; - } - - info!("received memory.high event, but can't do anything"); - } - }; - } - } - - /// Handle a `memory.high`, returning whether we are still waiting on upscale - /// by the time the function returns. - /// - /// The general plan for handling a `memory.high` event is as follows: - /// 1. Freeze the cgroup - /// 2. 
Start a timer for `self.config.max_upscale_wait` - /// 3. Request upscale - /// 4. After the timer elapses or we receive upscale, thaw the cgroup. - /// 5. Return whether or not we are still waiting for upscale. If we are, - /// we'll increase the cgroups memory.high to avoid getting oom killed - #[tracing::instrument(skip_all)] - async fn handle_memory_high_event( - &self, - upscales: &mut mpsc::Receiver>, - ) -> anyhow::Result { - // Immediately freeze the cgroup before doing anything else. - info!("received memory.high event -> freezing cgroup"); - self.freeze().context("failed to freeze cgroup")?; - - // We'll use this for logging durations - let start_time = Instant::now(); - - // Await the upscale until we have to unfreeze - let timed = - tokio::time::timeout(self.config.max_upscale_wait, self.await_upscale(upscales)); - - // Request the upscale - info!( - wait = ?self.config.max_upscale_wait, - "sending request for immediate upscaling", - ); - self.upscale_requester - .send(()) - .await - .context("failed to request upscale")?; - - let waiting_on_upscale = match timed.await { - Ok(Ok(())) => { - info!(elapsed = ?start_time.elapsed(), "received upscale in time"); - false - } - // **important**: unfreeze the cgroup before ?-reporting the error - Ok(Err(e)) => { - info!("error waiting for upscale -> thawing cgroup"); - self.thaw() - .context("failed to thaw cgroup after errored waiting for upscale")?; - Err(e.context("failed to await upscale"))? - } - Err(_) => { - info!(elapsed = ?self.config.max_upscale_wait, "timed out waiting for upscale"); - true - } - }; - - info!("thawing cgroup"); - self.thaw().context("failed to thaw cgroup")?; - - Ok(waiting_on_upscale) - } - - /// Checks whether we were just upscaled, returning the upscale's sequence - /// number if so. - #[tracing::instrument(skip_all)] - fn upscaled( - &self, - upscales: &mut mpsc::Receiver>, - ) -> anyhow::Result> { - let Sequenced { seqnum, data } = match upscales.try_recv() { - Ok(upscale) => upscale, - Err(TryRecvError::Empty) => return Ok(None), - Err(TryRecvError::Disconnected) => { - bail!("upscale notification channel was disconnected") - } - }; - - // Make sure to update the last upscale sequence number - self.last_upscale_seqnum.store(seqnum, Ordering::Release); - info!(cpu = data.cpu, mem_bytes = data.mem, "received upscale"); - Ok(Some(seqnum)) - } - - /// Await an upscale event, discarding any `memory.high` events received in - /// the process. - /// - /// This is used in `handle_memory_high_event`, where we need to listen - /// for upscales in particular so we know if we can thaw the cgroup early. - #[tracing::instrument(skip_all)] - async fn await_upscale( - &self, - upscales: &mut mpsc::Receiver>, + updates: watch::Sender<(Instant, MemoryHistory)>, ) -> anyhow::Result<()> { - let Sequenced { seqnum, .. } = upscales - .recv() - .await - .context("error listening for upscales")?; + // this requirement makes the code a bit easier to work with; see the config for more. + assert!(self.config.memory_history_len <= self.config.memory_history_log_interval); - self.last_upscale_seqnum.store(seqnum, Ordering::Release); - Ok(()) - } + let mut ticker = tokio::time::interval(self.config.memory_poll_interval); + ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip); + // ticker.reset_immediately(); // FIXME: enable this once updating to tokio >= 1.30.0 - /// Get the cgroup's name. 
- pub fn path(&self) -> &str { - self.cgroup.path() - } -} + let mem_controller = self.memory()?; -// Methods for manipulating the actual cgroup -impl CgroupWatcher { - /// Get a handle on the freezer subsystem. - fn freezer(&self) -> anyhow::Result<&FreezerController> { - if let Some(Freezer(freezer)) = self - .cgroup - .subsystems() - .iter() - .find(|sub| matches!(sub, Freezer(_))) - { - Ok(freezer) - } else { - anyhow::bail!("could not find freezer subsystem") + // buffer for samples that will be logged. once full, it remains so. + let history_log_len = self.config.memory_history_log_interval; + let mut history_log_buf = vec![MemoryStatus::zeroed(); history_log_len]; + + for t in 0_u64.. { + ticker.tick().await; + + let now = Instant::now(); + let mem = Self::memory_usage(mem_controller); + + let i = t as usize % history_log_len; + history_log_buf[i] = mem; + + // We're taking *at most* memory_history_len values; we may be bounded by the total + // number of samples that have come in so far. + let samples_count = (t + 1).min(self.config.memory_history_len as u64) as usize; + // NB: in `ring_buf_recent_values_iter`, `i` is *inclusive*, which matches the fact + // that we just inserted a value there, so the end of the iterator will *include* the + // value at i, rather than stopping just short of it. + let samples = ring_buf_recent_values_iter(&history_log_buf, i, samples_count); + + let summary = MemoryHistory { + avg_non_reclaimable: samples.map(|h| h.non_reclaimable).sum::() + / samples_count as u64, + samples_count, + samples_span: self.config.memory_poll_interval * (samples_count - 1) as u32, + }; + + // Log the current history if it's time to do so. Because `history_log_buf` has length + // equal to the logging interval, we can just log the entire buffer every time we set + // the last entry, which also means that for this log line, we can ignore that it's a + // ring buffer (because all the entries are in order of increasing time). + if i == history_log_len - 1 { + info!( + history = ?MemoryStatus::debug_slice(&history_log_buf), + summary = ?summary, + "Recent cgroup memory statistics history" + ); + } + + updates + .send((now, summary)) + .context("failed to send MemoryHistory")?; } - } - /// Attempt to freeze the cgroup. - pub fn freeze(&self) -> anyhow::Result<()> { - self.freezer() - .context("failed to get freezer subsystem")? - .freeze() - .context("failed to freeze") - } - - /// Attempt to thaw the cgroup. - pub fn thaw(&self) -> anyhow::Result<()> { - self.freezer() - .context("failed to get freezer subsystem")? - .thaw() - .context("failed to thaw") + unreachable!() } /// Get a handle on the memory subsystem. - /// - /// Note: this method does not require `self.memory_update_lock` because - /// getting a handle to the subsystem does not access any of the files we - /// care about, such as memory.high and memory.events fn memory(&self) -> anyhow::Result<&MemController> { - if let Some(Mem(memory)) = self - .cgroup + self.cgroup .subsystems() .iter() - .find(|sub| matches!(sub, Mem(_))) - { - Ok(memory) - } else { - anyhow::bail!("could not find memory subsystem") - } - } - - /// Get cgroup current memory usage. - pub fn current_memory_usage(&self) -> anyhow::Result { - Ok(self - .memory() - .context("failed to get memory subsystem")? - .memory_stat() - .usage_in_bytes) - } - - /// Set cgroup memory.high threshold. 
- pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> { - self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)) - } - - /// Set the cgroup's memory.high to 'max', disabling it. - pub fn unset_memory_high(&self) -> anyhow::Result<()> { - self.set_memory_high_internal(MaxValue::Max) - } - - fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> { - self.memory() - .context("failed to get memory subsystem")? - .set_mem(cgroups_rs::memory::SetMemory { - low: None, - high: Some(value), - min: None, - max: None, + .find_map(|sub| match sub { + Subsystem::Mem(c) => Some(c), + _ => None, }) - .map_err(anyhow::Error::from) + .ok_or_else(|| anyhow!("could not find memory subsystem")) } - /// Get memory.high threshold. - pub fn get_memory_high_bytes(&self) -> anyhow::Result { - let high = self - .memory() - .context("failed to get memory subsystem while getting memory statistics")? - .get_mem() - .map(|mem| mem.high) - .context("failed to get memory statistics from subsystem")?; - match high { - Some(MaxValue::Max) => Ok(i64::MAX as u64), - Some(MaxValue::Value(high)) => Ok(high as u64), - None => anyhow::bail!("failed to read memory.high from memory subsystem"), + /// Given a handle on the memory subsystem, returns the current memory information + fn memory_usage(mem_controller: &MemController) -> MemoryStatus { + let stat = mem_controller.memory_stat().stat; + MemoryStatus { + non_reclaimable: stat.active_anon + stat.inactive_anon, } } } + +// Helper function for `CgroupWatcher::watch` +fn ring_buf_recent_values_iter( + buf: &[T], + last_value_idx: usize, + count: usize, +) -> impl '_ + Iterator { + // Assertion carried over from `CgroupWatcher::watch`, to make the logic in this function + // easier (we only have to add `buf.len()` once, rather than a dynamic number of times). + assert!(count <= buf.len()); + + buf.iter() + // 'cycle' because the values could wrap around + .cycle() + // with 'cycle', this skip is more like 'offset', and functionally this is + // offsettting by 'last_value_idx - count (mod buf.len())', but we have to be + // careful to avoid underflow, so we pre-add buf.len(). + // The '+ 1' is because `last_value_idx` is inclusive, rather than exclusive. 
+ .skip((buf.len() + last_value_idx + 1 - count) % buf.len()) + .take(count) +} + +/// Summary of recent memory usage +#[derive(Debug, Copy, Clone)] +pub struct MemoryHistory { + /// Rolling average of non-reclaimable memory usage samples over the last `history_period` + pub avg_non_reclaimable: u64, + + /// The number of samples used to construct this summary + pub samples_count: usize, + /// Total timespan between the first and last sample used for this summary + pub samples_span: Duration, +} + +#[derive(Debug, Copy, Clone)] +pub struct MemoryStatus { + non_reclaimable: u64, +} + +impl MemoryStatus { + fn zeroed() -> Self { + MemoryStatus { non_reclaimable: 0 } + } + + fn debug_slice(slice: &[Self]) -> impl '_ + Debug { + struct DS<'a>(&'a [MemoryStatus]); + + impl<'a> Debug for DS<'a> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_struct("[MemoryStatus]") + .field( + "non_reclaimable[..]", + &Fields(self.0, |stat: &MemoryStatus| { + BytesToGB(stat.non_reclaimable) + }), + ) + .finish() + } + } + + struct Fields<'a, F>(&'a [MemoryStatus], F); + + impl<'a, F: Fn(&MemoryStatus) -> T, T: Debug> Debug for Fields<'a, F> { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.debug_list().entries(self.0.iter().map(&self.1)).finish() + } + } + + struct BytesToGB(u64); + + impl Debug for BytesToGB { + fn fmt(&self, f: &mut Formatter) -> fmt::Result { + f.write_fmt(format_args!( + "{:.3}Gi", + self.0 as f64 / (1_u64 << 30) as f64 + )) + } + } + + DS(slice) + } +} + +#[cfg(test)] +mod tests { + #[test] + fn ring_buf_iter() { + let buf = vec![0_i32, 1, 2, 3, 4, 5, 6, 7, 8, 9]; + + let values = |offset, count| { + super::ring_buf_recent_values_iter(&buf, offset, count) + .copied() + .collect::>() + }; + + // Boundary conditions: start, end, and entire thing: + assert_eq!(values(0, 1), [0]); + assert_eq!(values(3, 4), [0, 1, 2, 3]); + assert_eq!(values(9, 4), [6, 7, 8, 9]); + assert_eq!(values(9, 10), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]); + + // "normal" operation: no wraparound + assert_eq!(values(7, 4), [4, 5, 6, 7]); + + // wraparound: + assert_eq!(values(0, 4), [7, 8, 9, 0]); + assert_eq!(values(1, 4), [8, 9, 0, 1]); + assert_eq!(values(2, 4), [9, 0, 1, 2]); + assert_eq!(values(2, 10), [3, 4, 5, 6, 7, 8, 9, 0, 1, 2]); + } +} diff --git a/libs/vm_monitor/src/dispatcher.rs b/libs/vm_monitor/src/dispatcher.rs index 109a68fff1..c76baf04e7 100644 --- a/libs/vm_monitor/src/dispatcher.rs +++ b/libs/vm_monitor/src/dispatcher.rs @@ -12,12 +12,10 @@ use futures::{ stream::{SplitSink, SplitStream}, SinkExt, StreamExt, }; -use tokio::sync::mpsc; use tracing::info; -use crate::cgroup::Sequenced; use crate::protocol::{ - OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, Resources, PROTOCOL_MAX_VERSION, + OutboundMsg, ProtocolRange, ProtocolResponse, ProtocolVersion, PROTOCOL_MAX_VERSION, PROTOCOL_MIN_VERSION, }; @@ -36,13 +34,6 @@ pub struct Dispatcher { /// We send messages to the agent through `sink` sink: SplitSink, - /// Used to notify the cgroup when we are upscaled. - pub(crate) notify_upscale_events: mpsc::Sender>, - - /// When the cgroup requests upscale it will send on this channel. In response - /// we send an `UpscaleRequst` to the agent. - pub(crate) request_upscale_events: mpsc::Receiver<()>, - /// The protocol version we have agreed to use with the agent. This is negotiated /// during the creation of the dispatcher, and should be the highest shared protocol /// version. @@ -61,11 +52,7 @@ impl Dispatcher { /// 1. 
Wait for the agent to sent the range of protocols it supports. /// 2. Send a protocol version that works for us as well, or an error if there /// is no compatible version. - pub async fn new( - stream: WebSocket, - notify_upscale_events: mpsc::Sender>, - request_upscale_events: mpsc::Receiver<()>, - ) -> anyhow::Result { + pub async fn new(stream: WebSocket) -> anyhow::Result { let (mut sink, mut source) = stream.split(); // Figure out the highest protocol version we both support @@ -119,22 +106,10 @@ impl Dispatcher { Ok(Self { sink, source, - notify_upscale_events, - request_upscale_events, proto_version: highest_shared_version, }) } - /// Notify the cgroup manager that we have received upscale and wait for - /// the acknowledgement. - #[tracing::instrument(skip_all, fields(?resources))] - pub async fn notify_upscale(&self, resources: Sequenced) -> anyhow::Result<()> { - self.notify_upscale_events - .send(resources) - .await - .context("failed to send resources and oneshot sender across channel") - } - /// Send a message to the agent. /// /// Although this function is small, it has one major benefit: it is the only diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index b0ee5f0310..a7a0995797 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -5,18 +5,16 @@ //! all functionality. use std::fmt::Debug; -use std::sync::Arc; use std::time::{Duration, Instant}; use anyhow::{bail, Context}; use axum::extract::ws::{Message, WebSocket}; use futures::StreamExt; -use tokio::sync::broadcast; -use tokio::sync::mpsc; +use tokio::sync::{broadcast, watch}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use crate::cgroup::{CgroupWatcher, Sequenced}; +use crate::cgroup::{self, CgroupWatcher}; use crate::dispatcher::Dispatcher; use crate::filecache::{FileCacheConfig, FileCacheState}; use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources}; @@ -28,7 +26,7 @@ use crate::{bytes_to_mebibytes, get_total_system_memory, spawn_with_cancel, Args pub struct Runner { config: Config, filecache: Option, - cgroup: Option>, + cgroup: Option, dispatcher: Dispatcher, /// We "mint" new message ids by incrementing this counter and taking the value. @@ -45,6 +43,14 @@ pub struct Runner { kill: broadcast::Receiver<()>, } +#[derive(Debug)] +struct CgroupState { + watcher: watch::Receiver<(Instant, cgroup::MemoryHistory)>, + /// If [`cgroup::MemoryHistory::avg_non_reclaimable`] exceeds `threshold`, we send upscale + /// requests. + threshold: u64, +} + /// Configuration for a `Runner` #[derive(Debug)] pub struct Config { @@ -62,16 +68,56 @@ pub struct Config { /// upscale resource amounts (because we might not *actually* have been upscaled yet). This field /// should be removed once we have a better solution there. sys_buffer_bytes: u64, + + /// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in + /// other words, providing a ceiling for the highest value of the threshold by enforcing that + /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the + /// threshold. + /// + /// For example, a value of `0.1` means that 10% of total memory must remain after exceeding + /// the threshold, so the value of the cgroup threshold would always be capped at 90% of total + /// memory. 
+ /// + /// The default value of `0.15` means that we *guarantee* sending upscale requests if the + /// cgroup is using more than 85% of total memory (even if we're *not* separately reserving + /// memory for the file cache). + cgroup_min_overhead_fraction: f64, + + cgroup_downscale_threshold_buffer_bytes: u64, } impl Default for Config { fn default() -> Self { Self { sys_buffer_bytes: 100 * MiB, + cgroup_min_overhead_fraction: 0.15, + cgroup_downscale_threshold_buffer_bytes: 100 * MiB, } } } +impl Config { + fn cgroup_threshold(&self, total_mem: u64, file_cache_disk_size: u64) -> u64 { + // If the file cache is in tmpfs, then it will count towards shmem usage of the cgroup, + // and thus be non-reclaimable, so we should allow for additional memory usage. + // + // If the file cache sits on disk, our desired stable system state is for it to be fully + // page cached (its contents should only be paged to/from disk in situations where we can't + // upscale fast enough). Page-cached memory is reclaimable, so we need to lower the + // threshold for non-reclaimable memory so we scale up *before* the kernel starts paging + // out the file cache. + let memory_remaining_for_cgroup = total_mem.saturating_sub(file_cache_disk_size); + + // Even if we're not separately making room for the file cache (if it's in tmpfs), we still + // want our threshold to be met gracefully instead of letting postgres get OOM-killed. + // So we guarantee that there's at least `cgroup_min_overhead_fraction` of total memory + // remaining above the threshold. + let max_threshold = (total_mem as f64 * (1.0 - self.cgroup_min_overhead_fraction)) as u64; + + memory_remaining_for_cgroup.min(max_threshold) + } +} + impl Runner { /// Create a new monitor. #[tracing::instrument(skip_all, fields(?config, ?args))] @@ -87,12 +133,7 @@ impl Runner { "invalid monitor Config: sys_buffer_bytes cannot be 0" ); - // *NOTE*: the dispatcher and cgroup manager talk through these channels - // so make sure they each get the correct half, nothing is droppped, etc. - let (notified_send, notified_recv) = mpsc::channel(1); - let (requesting_send, requesting_recv) = mpsc::channel(1); - - let dispatcher = Dispatcher::new(ws, notified_send, requesting_recv) + let dispatcher = Dispatcher::new(ws) .await .context("error creating new dispatcher")?; @@ -106,46 +147,10 @@ impl Runner { kill, }; - // If we have both the cgroup and file cache integrations enabled, it's possible for - // temporary failures to result in cgroup throttling (from memory.high), that in turn makes - // it near-impossible to connect to the file cache (because it times out). Unfortunately, - // we *do* still want to determine the file cache size before setting the cgroup's - // memory.high, so it's not as simple as just swapping the order. - // - // Instead, the resolution here is that on vm-monitor startup (note: happens on each - // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we - // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit - // of a hacky solution, but helps with reliability. - if let Some(name) = &args.cgroup { - // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state - // now, and then set limits later. 
- info!("initializing cgroup"); - - let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send) - .context("failed to create cgroup manager")?; - - info!("temporarily unsetting memory.high"); - - // Temporarily un-set cgroup memory.high; see above. - cgroup - .unset_memory_high() - .context("failed to unset memory.high")?; - - let cgroup = Arc::new(cgroup); - - let cgroup_clone = Arc::clone(&cgroup); - spawn_with_cancel( - token.clone(), - |_| error!("cgroup watcher terminated"), - async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await }, - ); - - state.cgroup = Some(cgroup); - } - - let mut file_cache_reserved_bytes = 0; let mem = get_total_system_memory(); + let mut file_cache_disk_size = 0; + // We need to process file cache initialization before cgroup initialization, so that the memory // allocated to the file cache is appropriately taken into account when we decide the cgroup's // memory limits. @@ -156,7 +161,7 @@ impl Runner { false => FileCacheConfig::default_in_memory(), }; - let mut file_cache = FileCacheState::new(connstr, config, token) + let mut file_cache = FileCacheState::new(connstr, config, token.clone()) .await .context("failed to create file cache")?; @@ -181,23 +186,40 @@ impl Runner { if actual_size != new_size { info!("file cache size actually got set to {actual_size}") } - // Mark the resources given to the file cache as reserved, but only if it's in memory. - if !args.file_cache_on_disk { - file_cache_reserved_bytes = actual_size; + + if args.file_cache_on_disk { + file_cache_disk_size = actual_size; } state.filecache = Some(file_cache); } - if let Some(cgroup) = &state.cgroup { - let available = mem - file_cache_reserved_bytes; - let value = cgroup.config.calculate_memory_high_value(available); + if let Some(name) = &args.cgroup { + // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state + // now, and then set limits later. 
+ info!("initializing cgroup"); - info!(value, "setting memory.high"); + let cgroup = + CgroupWatcher::new(name.clone()).context("failed to create cgroup manager")?; - cgroup - .set_memory_high_bytes(value) - .context("failed to set cgroup memory.high")?; + let init_value = cgroup::MemoryHistory { + avg_non_reclaimable: 0, + samples_count: 0, + samples_span: Duration::ZERO, + }; + let (hist_tx, hist_rx) = watch::channel((Instant::now(), init_value)); + + spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move { + cgroup.watch(hist_tx).await + }); + + let threshold = state.config.cgroup_threshold(mem, file_cache_disk_size); + info!(threshold, "set initial cgroup threshold",); + + state.cgroup = Some(CgroupState { + watcher: hist_rx, + threshold, + }); } Ok(state) @@ -217,28 +239,40 @@ impl Runner { let requested_mem = target.mem; let usable_system_memory = requested_mem.saturating_sub(self.config.sys_buffer_bytes); - let expected_file_cache_mem_usage = self + let (expected_file_cache_size, expected_file_cache_disk_size) = self .filecache .as_ref() - .map(|file_cache| file_cache.config.calculate_cache_size(usable_system_memory)) - .unwrap_or(0); - let mut new_cgroup_mem_high = 0; + .map(|file_cache| { + let size = file_cache.config.calculate_cache_size(usable_system_memory); + match file_cache.config.in_memory { + true => (size, 0), + false => (size, size), + } + }) + .unwrap_or((0, 0)); if let Some(cgroup) = &self.cgroup { - new_cgroup_mem_high = cgroup + let (last_time, last_history) = *cgroup.watcher.borrow(); + + // TODO: make the duration here configurable. + if last_time.elapsed() > Duration::from_secs(5) { + bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information"); + } else if last_history.samples_count <= 1 { + bail!("haven't received enough cgroup memory stats yet"); + } + + let new_threshold = self .config - .calculate_memory_high_value(usable_system_memory - expected_file_cache_mem_usage); + .cgroup_threshold(usable_system_memory, expected_file_cache_disk_size); - let current = cgroup - .current_memory_usage() - .context("failed to fetch cgroup memory")?; + let current = last_history.avg_non_reclaimable; - if new_cgroup_mem_high < current + cgroup.config.memory_high_buffer_bytes { + if new_threshold < current + self.config.cgroup_downscale_threshold_buffer_bytes { let status = format!( - "{}: {} MiB (new high) < {} (current usage) + {} (buffer)", - "calculated memory.high too low", - bytes_to_mebibytes(new_cgroup_mem_high), + "{}: {} MiB (new threshold) < {} (current usage) + {} (downscale buffer)", + "calculated memory threshold too low", + bytes_to_mebibytes(new_threshold), bytes_to_mebibytes(current), - bytes_to_mebibytes(cgroup.config.memory_high_buffer_bytes) + bytes_to_mebibytes(self.config.cgroup_downscale_threshold_buffer_bytes) ); info!(status, "discontinuing downscale"); @@ -249,14 +283,14 @@ impl Runner { // The downscaling has been approved. Downscale the file cache, then the cgroup. 
let mut status = vec![]; - let mut file_cache_mem_usage = 0; + let mut file_cache_disk_size = 0; if let Some(file_cache) = &mut self.filecache { let actual_usage = file_cache - .set_file_cache_size(expected_file_cache_mem_usage) + .set_file_cache_size(expected_file_cache_size) .await .context("failed to set file cache size")?; - if file_cache.config.in_memory { - file_cache_mem_usage = actual_usage; + if !file_cache.config.in_memory { + file_cache_disk_size = actual_usage; } let message = format!( "set file cache size to {} MiB (in memory = {})", @@ -267,24 +301,18 @@ impl Runner { status.push(message); } - if let Some(cgroup) = &self.cgroup { - let available_memory = usable_system_memory - file_cache_mem_usage; - - if file_cache_mem_usage != expected_file_cache_mem_usage { - new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory); - } - - // new_cgroup_mem_high is initialized to 0 but it is guaranteed to not be here - // since it is properly initialized in the previous cgroup if let block - cgroup - .set_memory_high_bytes(new_cgroup_mem_high) - .context("failed to set cgroup memory.high")?; + if let Some(cgroup) = &mut self.cgroup { + let new_threshold = self + .config + .cgroup_threshold(usable_system_memory, file_cache_disk_size); let message = format!( - "set cgroup memory.high to {} MiB, of new max {} MiB", - bytes_to_mebibytes(new_cgroup_mem_high), - bytes_to_mebibytes(available_memory) + "set cgroup memory threshold from {} MiB to {} MiB, of new total {} MiB", + bytes_to_mebibytes(cgroup.threshold), + bytes_to_mebibytes(new_threshold), + bytes_to_mebibytes(usable_system_memory) ); + cgroup.threshold = new_threshold; info!("downscale: {message}"); status.push(message); } @@ -305,8 +333,7 @@ impl Runner { let new_mem = resources.mem; let usable_system_memory = new_mem.saturating_sub(self.config.sys_buffer_bytes); - // Get the file cache's expected contribution to the memory usage - let mut file_cache_mem_usage = 0; + let mut file_cache_disk_size = 0; if let Some(file_cache) = &mut self.filecache { let expected_usage = file_cache.config.calculate_cache_size(usable_system_memory); info!( @@ -319,8 +346,8 @@ impl Runner { .set_file_cache_size(expected_usage) .await .context("failed to set file cache size")?; - if file_cache.config.in_memory { - file_cache_mem_usage = actual_usage; + if !file_cache.config.in_memory { + file_cache_disk_size = actual_usage; } if actual_usage != expected_usage { @@ -332,18 +359,18 @@ impl Runner { } } - if let Some(cgroup) = &self.cgroup { - let available_memory = usable_system_memory - file_cache_mem_usage; - let new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory); + if let Some(cgroup) = &mut self.cgroup { + let new_threshold = self + .config + .cgroup_threshold(usable_system_memory, file_cache_disk_size); + info!( - target = bytes_to_mebibytes(new_cgroup_mem_high), - total = bytes_to_mebibytes(new_mem), - name = cgroup.path(), - "updating cgroup memory.high", + "set cgroup memory threshold from {} MiB to {} MiB of new total {} MiB", + bytes_to_mebibytes(cgroup.threshold), + bytes_to_mebibytes(new_threshold), + bytes_to_mebibytes(usable_system_memory) ); - cgroup - .set_memory_high_bytes(new_cgroup_mem_high) - .context("failed to set cgroup memory.high")?; + cgroup.threshold = new_threshold; } Ok(()) @@ -361,10 +388,6 @@ impl Runner { self.handle_upscale(granted) .await .context("failed to handle upscale")?; - self.dispatcher - .notify_upscale(Sequenced::new(granted)) - .await - .context("failed to 
notify notify cgroup of upscale")?; Ok(Some(OutboundMsg::new( OutboundMsgKind::UpscaleConfirmation {}, id, @@ -408,33 +431,53 @@ impl Runner { Err(e) => bail!("failed to receive kill signal: {e}") } } - // we need to propagate an upscale request - request = self.dispatcher.request_upscale_events.recv(), if self.cgroup.is_some() => { - if request.is_none() { - bail!("failed to listen for upscale event from cgroup") + + // New memory stats from the cgroup, *may* need to request upscaling, if we've + // exceeded the threshold + result = self.cgroup.as_mut().unwrap().watcher.changed(), if self.cgroup.is_some() => { + result.context("failed to receive from cgroup memory stats watcher")?; + + let cgroup = self.cgroup.as_ref().unwrap(); + + let (_time, cgroup_mem_stat) = *cgroup.watcher.borrow(); + + // If we haven't exceeded the threshold, then we're all ok + if cgroup_mem_stat.avg_non_reclaimable < cgroup.threshold { + continue; } - // If it's been less than 1 second since the last time we requested upscaling, - // ignore the event, to avoid spamming the agent (otherwise, this can happen - // ~1k times per second). + // Otherwise, we generally want upscaling. But, if it's been less than 1 second + // since the last time we requested upscaling, ignore the event, to avoid + // spamming the agent. if let Some(t) = self.last_upscale_request_at { let elapsed = t.elapsed(); if elapsed < Duration::from_secs(1) { - info!(elapsed_millis = elapsed.as_millis(), "cgroup asked for upscale but too soon to forward the request, ignoring"); + info!( + elapsed_millis = elapsed.as_millis(), + avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable), + threshold = bytes_to_mebibytes(cgroup.threshold), + "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring", + ); continue; } } self.last_upscale_request_at = Some(Instant::now()); - info!("cgroup asking for upscale; forwarding request"); + info!( + avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable), + threshold = bytes_to_mebibytes(cgroup.threshold), + "cgroup memory stats are high enough to upscale, requesting upscale", + ); + self.counter += 2; // Increment, preserving parity (i.e. keep the // counter odd). See the field comment for more. self.dispatcher .send(OutboundMsg::new(OutboundMsgKind::UpscaleRequest {}, self.counter)) .await .context("failed to send message")?; - } + }, + // there is a message from the agent msg = self.dispatcher.source.next() => { if let Some(msg) = msg { From 8a316b127711cdf5777f20547b15583be0b6499a Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 19 Oct 2023 09:10:33 -0700 Subject: [PATCH 082/209] vm-monitor: Log full error on message handling failure (#5604) There's currently an issue with the vm-monitor on staging that's not really feasible to debug because the current display impl gives no context to the errors (just says "failed to downscale"). Logging the full error should help. For communications with the autoscaler-agent, it's ok to only provide the outermost cause, because we can cross-reference with the VM logs. At some point in the future, we may want to change that. 
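For reference, here is how the same `anyhow` error chain renders under Display, under the alternate `{:#}` form used for the log line below, and under Debug. This is a standalone sketch assuming only the `anyhow` crate; the error text is invented for the example:

use anyhow::{anyhow, Context};

fn main() {
    // A two-level error chain, similar in shape to a failed downscale request.
    let err = Err::<(), anyhow::Error>(anyhow!("not enough cgroup memory samples yet"))
        .context("failed to handle message")
        .unwrap_err();

    // Display ("{}"): outermost context only -> "failed to handle message"
    println!("{err}");

    // Alternate Display ("{:#}"): every cause, compact, on one line ->
    // "failed to handle message: not enough cgroup memory samples yet"
    println!("{err:#}");

    // Debug ("{:?}"): multi-line pretty print with a "Caused by:" section.
    println!("{err:?}");
}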
--- libs/vm_monitor/src/runner.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index a7a0995797..e818292196 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -505,11 +505,14 @@ impl Runner { Ok(Some(out)) => out, Ok(None) => continue, Err(e) => { - let error = e.to_string(); - warn!(?error, "error handling message"); + // use {:#} for our logging because the display impl only + // gives the outermost cause, and the debug impl + // pretty-prints the error, whereas {:#} contains all the + // causes, but is compact (no newlines). + warn!(error = format!("{e:#}"), "error handling message"); OutboundMsg::new( OutboundMsgKind::InternalError { - error + error: e.to_string(), }, message.id ) From 850db4cc139d625a2097afa7d464d5fc66c0dfd2 Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 19 Oct 2023 11:09:37 -0700 Subject: [PATCH 083/209] vm-monitor: Deny not fail downscale if no memory stats yet (#5606) Fixes an issue we observed on staging that happens when the autoscaler-agent attempts to immediately downscale the VM after binding, which is typical for pooled computes. The issue was occurring because the autoscaler-agent was requesting downscaling before the vm-monitor had gathered sufficient cgroup memory stats to be confident in approving it. When the vm-monitor returned an internal error instead of denying downscaling, the autoscaler-agent retried the connection and immediately hit the same issue (in part because cgroup stats are collected per-connection, rather than globally). --- libs/vm_monitor/src/runner.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/libs/vm_monitor/src/runner.rs b/libs/vm_monitor/src/runner.rs index e818292196..99bea6d7ca 100644 --- a/libs/vm_monitor/src/runner.rs +++ b/libs/vm_monitor/src/runner.rs @@ -253,11 +253,22 @@ impl Runner { if let Some(cgroup) = &self.cgroup { let (last_time, last_history) = *cgroup.watcher.borrow(); + // NB: The ordering of these conditions is intentional. During startup, we should deny + // downscaling until we have enough information to determine that it's safe to do so + // (i.e. enough samples have come in). But if it's been a while and we *still* haven't + // received any information, we should *fail* instead of just denying downscaling. + // + // `last_time` is set to `Instant::now()` on startup, so checking `last_time.elapsed()` + // serves double-duty: it trips if we haven't received *any* metrics for long enough, + // OR if we haven't received metrics *recently enough*. + // // TODO: make the duration here configurable. if last_time.elapsed() > Duration::from_secs(5) { bail!("haven't gotten cgroup memory stats recently enough to determine downscaling information"); } else if last_history.samples_count <= 1 { - bail!("haven't received enough cgroup memory stats yet"); + let status = "haven't received enough cgroup memory stats yet"; + info!(status, "discontinuing downscale"); + return Ok((false, status.to_owned())); } let new_threshold = self From 0d80d6ce1873b067d608acc8942ba9ef909f8c08 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 25 Oct 2023 20:35:23 +0300 Subject: [PATCH 084/209] Ignore missed AUX_FILES_KEY when generating image layer (#5660) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Logical replication requires new AUX_FILES_KEY which is definitely absent in existed database. 
We do not have function to check if key exists in our KV storage. So I have to handle the error in `list_aux_files` method. But this key is also included in key space range and accessed y `create_image_layer` method. ## Summary of changes Check if AUX_FILES_KEY exists before including it in keyspace. --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Shany Pozin Co-authored-by: Arpad Müller --- pageserver/src/pgdatadir_mapping.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d27c8a3d5d..52c44746dc 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -675,8 +675,9 @@ impl Timeline { result.add_key(CONTROLFILE_KEY); result.add_key(CHECKPOINT_KEY); - result.add_key(AUX_FILES_KEY); - + if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { + result.add_key(AUX_FILES_KEY); + } Ok(result.to_keyspace()) } From 4b3b37b91250fe5732ec98cf9a64bd59db678d8b Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Tue, 24 Oct 2023 16:04:28 -0700 Subject: [PATCH 085/209] Bump vm-builder v0.18.1 -> v0.18.2 (#5646) Only applicable change was neondatabase/autoscaling#571, removing the postgres_exporter flags `--auto-discover-databases` and `--exclude-databases=...` --- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 3f1c728c70..8fda08efda 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -837,7 +837,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.18.1 + VM_BUILDER_VERSION: v0.18.2 steps: - name: Checkout From 7d384d6953197c8b62e5eade00979a824f044d6e Mon Sep 17 00:00:00 2001 From: Em Sharnoff Date: Thu, 26 Oct 2023 12:04:57 -0700 Subject: [PATCH 086/209] Bump vm-builder v0.18.2 -> v0.18.4 (#5666) Only applicable change was neondatabase/autoscaling#584, setting pgbouncer auth_dbname=postgres in order to fix superuser connections from preventing dropping databases. 
--- .github/workflows/build_and_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8fda08efda..6b487c0789 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -837,7 +837,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.18.2 + VM_BUILDER_VERSION: v0.18.5 steps: - name: Checkout From 48b845fa769073e232ab39b09d386ba251fcb8a3 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 27 Nov 2023 21:45:41 +0000 Subject: [PATCH 087/209] Make neon extension relocatable to allow SET SCHEMA (#5942) --- pgxn/neon/neon.control | 1 + 1 file changed, 1 insertion(+) diff --git a/pgxn/neon/neon.control b/pgxn/neon/neon.control index c110437c3e..4e4cb9f372 100644 --- a/pgxn/neon/neon.control +++ b/pgxn/neon/neon.control @@ -2,3 +2,4 @@ comment = 'cloud storage for PostgreSQL' default_version = '1.1' module_pathname = '$libdir/neon' +relocatable = true From e81fc598f416a2620ae8e942811e545495de490d Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 27 Nov 2023 23:29:24 +0000 Subject: [PATCH 088/209] Update neon extension relocatable for existing installations (#5943) --- compute_tools/src/spec.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 97aa144c79..8c44c6d519 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -687,6 +687,9 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> { info!("create neon extension with query: {}", query); client.simple_query(query)?; + query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'"; + client.simple_query(query)?; + query = "ALTER EXTENSION neon SET SCHEMA neon"; info!("alter neon extension schema with query: {}", query); client.simple_query(query)?; From 5d71601ca9e92c9411ca751a6373e03dd64f03dc Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 29 Nov 2023 11:36:23 +0300 Subject: [PATCH 089/209] Notify safekeeper readiness with systemd. To avoid downtime during deploy, as in busy regions initial load can currently take ~30s. 
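The mechanism is systemd's readiness protocol: with Type=notify, systemd keeps the unit in an activating state until the process reports READY=1, so deploy tooling naturally waits for the slow load to finish. A minimal sketch with the `sd-notify` crate; the startup and serve functions here are placeholders:

use sd_notify::NotifyState;

fn main() {
    // Placeholder for the slow startup work (e.g. loading timelines).
    load_everything();

    // Report readiness; with Type=notify the unit only counts as started
    // after this point. The `true` asks the crate to clear NOTIFY_SOCKET
    // from the environment so child processes do not inherit it.
    // Treat a notification failure as non-fatal and just log it.
    if let Err(e) = sd_notify::notify(true, &[NotifyState::Ready]) {
        eprintln!("systemd notify failed: {e:?}");
    }

    serve();
}

fn load_everything() { /* ... */ }
fn serve() { /* ... */ }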
--- Cargo.lock | 7 +++++++ Cargo.toml | 1 + safekeeper/Cargo.toml | 1 + safekeeper/src/bin/safekeeper.rs | 7 +++++++ 4 files changed, 16 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index a75209b8db..48c7bf1795 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4137,6 +4137,7 @@ dependencies = [ "reqwest", "safekeeper_api", "scopeguard", + "sd-notify", "serde", "serde_json", "serde_with", @@ -4199,6 +4200,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "sd-notify" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32" + [[package]] name = "security-framework" version = "2.9.1" diff --git a/Cargo.toml b/Cargo.toml index c50fb7be42..6df48ffc55 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -122,6 +122,7 @@ rustls-pemfile = "1" rustls-split = "0.3" scopeguard = "1.1" sysinfo = "0.29.2" +sd-notify = "0.4.1" sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] } serde = { version = "1.0", features = ["derive"] } serde_json = "1" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 30516c0763..53fcd5ff07 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -46,6 +46,7 @@ postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true +sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true utils.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 0b5bb22c8b..e59deb9fda 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -8,6 +8,7 @@ use futures::future::BoxFuture; use futures::stream::FuturesUnordered; use futures::{FutureExt, StreamExt}; use remote_storage::RemoteStorageConfig; +use sd_notify::NotifyState; use tokio::runtime::Handle; use tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; @@ -434,6 +435,12 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { let mut sigint_stream = signal(SignalKind::interrupt())?; let mut sigterm_stream = signal(SignalKind::terminate())?; + // Notify systemd that we are ready. This is important as currently loading + // timelines takes significant time (~30s in busy regions). + if let Err(e) = sd_notify::notify(true, &[NotifyState::Ready]) { + warn!("systemd notify failed: {:?}", e); + } + tokio::select! 
{ Some((task_name, res)) = tasks_handles.next()=> { error!("{} task failed: {:?}, exiting", task_name, res); From 40087b8164183d18da670657cc9aac042b23f769 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 1 Dec 2023 12:43:09 +0000 Subject: [PATCH 090/209] fix: use create_new instead of create for mutex file --- pageserver/src/tenant.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 249a9a80c5..7384459ab5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3147,7 +3147,10 @@ impl Tenant { let uninit_mark_path = self .conf .timeline_uninit_mark_file_path(tenant_id, timeline_id); - fs::File::create(&uninit_mark_path) + fs::OpenOptions::new() + .write(true) + .create_new(true) + .open(&uninit_mark_path) .context("Failed to create uninit mark file") .and_then(|_| { crashsafe::fsync_file_and_parent(&uninit_mark_path) From 558394f7103f33da8cfb36c7486ae5872bd4aa34 Mon Sep 17 00:00:00 2001 From: Shany Pozin Date: Mon, 4 Dec 2023 11:41:27 +0200 Subject: [PATCH 091/209] fix merge --- Cargo.lock | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2014c0498f..5639665758 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4363,12 +4363,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "sd-notify" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "621e3680f3e07db4c9c2c3fb07c6223ab2fab2e54bd3c04c3ae037990f428c32" - [[package]] name = "security-framework" version = "2.9.1" From 45c51227542b99cb5c4e6068e0ea34e61cfe3f19 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 4 Dec 2023 12:36:19 -0800 Subject: [PATCH 092/209] Remove trusted from wal2json --- Dockerfile.compute-node | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 425f163e8b..a3772265c0 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -721,8 +721,7 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar. echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ - make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/wal2json.control + make -j $(getconf _NPROCESSORS_ONLN) install ######################################################################################### # From 661fc41e716c50703876eba0f212e96ef8ae273f Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Wed, 6 Dec 2023 16:12:36 +0100 Subject: [PATCH 093/209] Revert timescaledb for pg14 and pg15 (#6056) ``` could not start the compute node: compute is in state "failed": db error: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory Caused by: ERROR: could not access file "$libdir/timescaledb-2.10.1": No such file or directory ``` --- Dockerfile.compute-node | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index a3772265c0..03280586f8 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -387,10 +387,20 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH "/usr/local/pgsql/bin:$PATH" -RUN apt-get update && \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15") \ + export TIMESCALEDB_VERSION=2.10.1 \ + export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ + ;; \ + *) \ + export TIMESCALEDB_VERSION=2.13.0 \ + export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ + ;; \ + esac && \ + apt-get update && \ apt-get install -y cmake && \ - wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \ - echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \ + wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ + echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \ ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \ cd build && \ From 04b82c92a72de06299ca1d73e452138f53844918 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 11 Dec 2023 23:27:53 +0200 Subject: [PATCH 094/209] fix: accidential return Ok (#6106) Error indicating request cancellation OR timeline shutdown was deemed as a reason to exit the background worker that calculated synthetic size. Fix it to only be considered for avoiding logging such of such errors. This conflicted on tenant_shard_id having already replaced tenant_id on `main`. --- pageserver/src/consumption_metrics.rs | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 7ad6a0f890..bfd7897b49 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -281,12 +281,18 @@ async fn calculate_synthetic_size_worker( // By using the same limiter, we centralize metrics collection for "start" and "finished" counters, // which turns out is really handy to understand the system. 
if let Err(e) = tenant.calculate_synthetic_size(cause, cancel, ctx).await { - if let Some(PageReconstructError::Cancelled) = - e.downcast_ref::() - { - return Ok(()); + // this error can be returned if timeline is shutting down, but it does not + // mean the synthetic size worker should terminate. we do not need any checks + // in this function because `mgr::get_tenant` will error out after shutdown has + // progressed to shutting down tenants. + let is_cancelled = matches!( + e.downcast_ref::(), + Some(PageReconstructError::Cancelled) + ); + + if !is_cancelled { + error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); } - error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}"); } } } @@ -299,7 +305,7 @@ async fn calculate_synthetic_size_worker( let res = tokio::time::timeout_at( started_at + synthetic_size_calculation_interval, - task_mgr::shutdown_token().cancelled(), + cancel.cancelled(), ) .await; if res.is_ok() { From 8ff3253f20dcbcee24b31b65d71c23a6b30d5c43 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Dec 2023 10:25:43 -0600 Subject: [PATCH 095/209] Fix git ownership issue in check-codestyle-rust-arm We have this workaround for other jobs. Looks like this one was forgotten about. --- .github/workflows/neon_extra_builds.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 0d7db8dfbc..09a106fb52 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -238,6 +238,20 @@ jobs: options: --init steps: + - name: Fix git ownership + run: | + # Workaround for `fatal: detected dubious ownership in repository at ...` + # + # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers + # Ref https://github.com/actions/checkout/issues/785 + # + git config --global --add safe.directory ${{ github.workspace }} + git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done + - name: Checkout uses: actions/checkout@v4 with: From a92702b01e1021b579c04b10bbf8ac7002939bfa Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Dec 2023 10:46:41 -0600 Subject: [PATCH 096/209] Add submodule paths as safe directories as a precaution The check-codestyle-rust-arm job requires this for some reason, so let's just add them everywhere we do this workaround. 
--- .github/workflows/build_and_test.yml | 8 ++++++++ .github/workflows/neon_extra_builds.yml | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 820848b4fb..693ed1a66f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -199,6 +199,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v3 @@ -1097,6 +1101,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v3 diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 09a106fb52..b1ea5e4f74 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -142,6 +142,10 @@ jobs: # git config --global --add safe.directory ${{ github.workspace }} git config --global --add safe.directory ${GITHUB_WORKSPACE} + for r in 14 15 16; do + git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" + git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" + done - name: Checkout uses: actions/checkout@v4 From 322ea1cf7ca6426643c0c3a9a82d9b9fba538e1b Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 18 Dec 2023 10:29:19 +0000 Subject: [PATCH 097/209] pageserver: on-demand activation cleanups (#6157) ## Problem #6112 added some logs and metrics: clean these up a bit: - Avoid counting startup completions for tenants launched after startup - exclude no-op cases from timing histograms - remove a rogue log messages --- pageserver/src/tenant.rs | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1478a1a445..eceef6bf78 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -629,9 +629,12 @@ impl Tenant { "attach tenant", false, async move { + // Is this tenant being spawned as part of process startup? + let starting_up = init_order.is_some(); scopeguard::defer! { - tracing::info!("Increment complete count"); - TENANT.startup_complete.inc(); + if starting_up { + TENANT.startup_complete.inc(); + } } // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state. 
@@ -711,7 +714,11 @@ impl Tenant { let preload_timer = TENANT.preload.start_timer(); let preload = match mode { - SpawnMode::Create => {None}, + SpawnMode::Create => { + // Don't count the skipped preload into the histogram of preload durations + preload_timer.stop_and_discard(); + None + }, SpawnMode::Normal => { match &remote_storage { Some(remote_storage) => Some( @@ -721,7 +728,11 @@ impl Tenant { tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()), ) .await { - Ok(p) => p, + Ok(p) => { + preload_timer.observe_duration(); + p + } + , Err(e) => { make_broken(&tenant_clone, anyhow::anyhow!(e)); return Ok(()); @@ -732,7 +743,6 @@ impl Tenant { } } }; - preload_timer.observe_duration(); // Remote preload is complete. drop(remote_load_completion); @@ -784,15 +794,19 @@ impl Tenant { } } - let attach_timer = TENANT.attach.start_timer(); + // We will time the duration of the attach phase unless this is a creation (attach will do no work) + let attach_timer = match mode { + SpawnMode::Create => None, + SpawnMode::Normal => {Some(TENANT.attach.start_timer())} + }; match tenant_clone.attach(preload, &ctx).await { Ok(()) => { info!("attach finished, activating"); - attach_timer.observe_duration(); + if let Some(t)= attach_timer {t.observe_duration();} tenant_clone.activate(broker_client, None, &ctx); } Err(e) => { - attach_timer.observe_duration(); + if let Some(t)= attach_timer {t.observe_duration();} make_broken(&tenant_clone, anyhow::anyhow!(e)); } } From e67b8f69c0e30389cbd037521d5f6024180c6752 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 18 Dec 2023 13:39:48 +0100 Subject: [PATCH 098/209] [PRE-MERGE] pageserver: Reduce tracing overhead in timeline::get #6115 Pre-merge `git merge --squash` of https://github.com/neondatabase/neon/pull/6115 Lowering the tracing level in get_value_reconstruct_data and get_or_maybe_download from info to debug reduces the overhead of span creation in non-debug environments. --- pageserver/src/tenant/storage_layer/layer.rs | 160 ++++++++++--------- test_runner/regress/test_broken_timeline.py | 4 +- 2 files changed, 85 insertions(+), 79 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index a4b102c314..9a8ddc1a6b 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -259,8 +259,9 @@ impl Layer { layer .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx) - .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self)) + .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self)) .await + .with_context(|| format!("get_value_reconstruct_data for layer {self}")) } /// Download the layer if evicted. @@ -654,7 +655,6 @@ impl LayerInner { } /// Cancellation safe. 
- #[tracing::instrument(skip_all, fields(layer=%self))] async fn get_or_maybe_download( self: &Arc, allow_download: bool, @@ -663,95 +663,101 @@ impl LayerInner { let mut init_permit = None; loop { - let download = move |permit| async move { - // disable any scheduled but not yet running eviction deletions for this - let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); + let download = move |permit| { + async move { + // disable any scheduled but not yet running eviction deletions for this + let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed); - // count cancellations, which currently remain largely unexpected - let init_cancelled = - scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); + // count cancellations, which currently remain largely unexpected + let init_cancelled = + scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); - // no need to make the evict_and_wait wait for the actual download to complete - drop(self.status.send(Status::Downloaded)); + // no need to make the evict_and_wait wait for the actual download to complete + drop(self.status.send(Status::Downloaded)); - let timeline = self - .timeline - .upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; + let timeline = self + .timeline + .upgrade() + .ok_or_else(|| DownloadError::TimelineShutdown)?; - // FIXME: grab a gate + // FIXME: grab a gate - let can_ever_evict = timeline.remote_client.as_ref().is_some(); + let can_ever_evict = timeline.remote_client.as_ref().is_some(); - // check if we really need to be downloaded; could have been already downloaded by a - // cancelled previous attempt. - let needs_download = self - .needs_download() - .await - .map_err(DownloadError::PreStatFailed)?; + // check if we really need to be downloaded; could have been already downloaded by a + // cancelled previous attempt. + let needs_download = self + .needs_download() + .await + .map_err(DownloadError::PreStatFailed)?; - let permit = if let Some(reason) = needs_download { - if let NeedsDownload::NotFile(ft) = reason { - return Err(DownloadError::NotFile(ft)); + let permit = if let Some(reason) = needs_download { + if let NeedsDownload::NotFile(ft) = reason { + return Err(DownloadError::NotFile(ft)); + } + + // only reset this after we've decided we really need to download. otherwise it'd + // be impossible to mark cancelled downloads for eviction, like one could imagine + // we would like to do for prefetching which was not needed. + self.wanted_evicted.store(false, Ordering::Release); + + if !can_ever_evict { + return Err(DownloadError::NoRemoteStorage); + } + + if let Some(ctx) = ctx { + self.check_expected_download(ctx)?; + } + + if !allow_download { + // this does look weird, but for LayerInner the "downloading" means also changing + // internal once related state ... + return Err(DownloadError::DownloadRequired); + } + + tracing::info!(%reason, "downloading on-demand"); + + self.spawn_download_and_wait(timeline, permit).await? + } else { + // the file is present locally, probably by a previous but cancelled call to + // get_or_maybe_download. alternatively we might be running without remote storage. 
+ LAYER_IMPL_METRICS.inc_init_needed_no_download(); + + permit + }; + + let since_last_eviction = + self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); + if let Some(since_last_eviction) = since_last_eviction { + // FIXME: this will not always be recorded correctly until #6028 (the no + // download needed branch above) + LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); } - // only reset this after we've decided we really need to download. otherwise it'd - // be impossible to mark cancelled downloads for eviction, like one could imagine - // we would like to do for prefetching which was not needed. - self.wanted_evicted.store(false, Ordering::Release); + let res = Arc::new(DownloadedLayer { + owner: Arc::downgrade(self), + kind: tokio::sync::OnceCell::default(), + version: next_version, + }); - if !can_ever_evict { - return Err(DownloadError::NoRemoteStorage); + self.access_stats.record_residence_event( + LayerResidenceStatus::Resident, + LayerResidenceEventReason::ResidenceChange, + ); + + let waiters = self.inner.initializer_count(); + if waiters > 0 { + tracing::info!( + waiters, + "completing the on-demand download for other tasks" + ); } - if let Some(ctx) = ctx { - self.check_expected_download(ctx)?; - } + scopeguard::ScopeGuard::into_inner(init_cancelled); - if !allow_download { - // this does look weird, but for LayerInner the "downloading" means also changing - // internal once related state ... - return Err(DownloadError::DownloadRequired); - } - - tracing::info!(%reason, "downloading on-demand"); - - self.spawn_download_and_wait(timeline, permit).await? - } else { - // the file is present locally, probably by a previous but cancelled call to - // get_or_maybe_download. alternatively we might be running without remote storage. - LAYER_IMPL_METRICS.inc_init_needed_no_download(); - - permit - }; - - let since_last_eviction = - self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed()); - if let Some(since_last_eviction) = since_last_eviction { - // FIXME: this will not always be recorded correctly until #6028 (the no - // download needed branch above) - LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction); + Ok((ResidentOrWantedEvicted::Resident(res), permit)) } - - let res = Arc::new(DownloadedLayer { - owner: Arc::downgrade(self), - kind: tokio::sync::OnceCell::default(), - version: next_version, - }); - - self.access_stats.record_residence_event( - LayerResidenceStatus::Resident, - LayerResidenceEventReason::ResidenceChange, - ); - - let waiters = self.inner.initializer_count(); - if waiters > 0 { - tracing::info!(waiters, "completing the on-demand download for other tasks"); - } - - scopeguard::ScopeGuard::into_inner(init_cancelled); - - Ok((ResidentOrWantedEvicted::Resident(res), permit)) + .instrument(tracing::info_span!("get_or_maybe_download", layer=%self)) }; if let Some(init_permit) = init_permit.take() { diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 53eeb8bbe9..4da0ba7b20 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -20,7 +20,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): env.pageserver.allowed_errors.extend( [ - ".*layer loading failed:.*", + ".*get_value_reconstruct_data for layer .*", ".*could not find data for key.*", ".*is not active. Current state: Broken.*", ".*will not become active. 
Current state: Broken.*", @@ -83,7 +83,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): # (We don't check layer file contents on startup, when loading the timeline) # # This will change when we implement checksums for layers - with pytest.raises(Exception, match="layer loading failed:") as err: + with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err: pg2.start() log.info( f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}" From 80be423a58a5fa0518c70a33727678731ecf9561 Mon Sep 17 00:00:00 2001 From: Sasha Krassovsky Date: Mon, 18 Dec 2023 10:22:36 -0800 Subject: [PATCH 099/209] Grant BYPASSRLS and REPLICATION explicitly to neon_superuser roles --- compute_tools/src/spec.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index ba1ee6d1b2..20299c8fde 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -298,7 +298,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> { // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited // from neon_superuser. let mut query: String = format!( - "CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser", + "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser", name.pg_quote() ); info!("role create query: '{}'", &query); From 6eda0a3158ce40bd4520ba6e275bd3958f1a7be7 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 19 Dec 2023 13:31:04 +0000 Subject: [PATCH 100/209] [PRE-MERGE] fix metric `pageserver_initial_logical_size_start_calculation` (This is a pre-merge cherry-pick of https://github.com/neondatabase/neon/pull/6191) It wasn't being incremented. 
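The gotcha is that `with_label_values` only resolves (and lazily creates) the labelled child; the counter does not move until `.inc()` is called on the returned child. A reduced sketch against the prometheus crate directly, with metric and label names made up for illustration:

use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

static START_CALCULATION: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "initial_logical_size_start_calculation",
        "Number of started calculations",
        &["attempt", "circumstances"]
    )
    .expect("failed to define a metric")
});

fn main() {
    // Bug: this only looks up / creates the child, the counter stays at 0.
    START_CALCULATION.with_label_values(&["first", "initial"]);

    // Fix: actually increment the resolved child.
    START_CALCULATION.with_label_values(&["first", "initial"]).inc();

    assert_eq!(
        START_CALCULATION.with_label_values(&["first", "initial"]).get(),
        1
    );
}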
Fixup of commit 1c88824ed0e6bfbce02fa92e13ca91d5ab0e37b3 Author: Christian Schwarz Date: Fri Dec 1 12:52:59 2023 +0100 initial logical size calculation: add a bunch of metrics (#5995) --- pageserver/src/metrics.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 45c01b71d1..e34d7c0287 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -522,14 +522,18 @@ pub(crate) mod initial_logical_size { impl StartCalculation { pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard { let circumstances_label: &'static str = circumstances.into(); - self.0.with_label_values(&["first", circumstances_label]); + self.0 + .with_label_values(&["first", circumstances_label]) + .inc(); OngoingCalculationGuard { inc_drop_calculation: Some(DROP_CALCULATION.first.clone()), } } pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard { let circumstances_label: &'static str = circumstances.into(); - self.0.with_label_values(&["retry", circumstances_label]); + self.0 + .with_label_values(&["retry", circumstances_label]) + .inc(); OngoingCalculationGuard { inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()), } From 39be366fc5a4d3e3c9713c247dc42305f4fc56e4 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 19 Dec 2023 14:46:17 +0100 Subject: [PATCH 101/209] higher resolution histograms for getpage@lsn (#6177) part of https://github.com/neondatabase/cloud/issues/7811 --- pageserver/src/metrics.rs | 52 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e34d7c0287..4725903783 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1023,12 +1023,62 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static SMGR_QUERY_TIME_GLOBAL_BUCKETS: Lazy> = Lazy::new(|| { + [ + 1, + 10, + 20, + 40, + 60, + 80, + 100, + 200, + 300, + 400, + 500, + 600, + 700, + 800, + 900, + 1_000, // 1ms + 2_000, + 4_000, + 6_000, + 8_000, + 10_000, // 10ms + 20_000, + 40_000, + 60_000, + 80_000, + 100_000, + 200_000, + 400_000, + 600_000, + 800_000, + 1_000_000, // 1s + 2_000_000, + 4_000_000, + 6_000_000, + 8_000_000, + 10_000_000, // 10s + 20_000_000, + 50_000_000, + 100_000_000, + 200_000_000, + 1_000_000_000, // 1000s + ] + .into_iter() + .map(Duration::from_micros) + .map(|d| d.as_secs_f64()) + .collect() +}); + static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_smgr_query_seconds_global", "Time spent on smgr query handling, aggregated by query type.", &["smgr_query_type"], - CRITICAL_OP_BUCKETS.into(), + SMGR_QUERY_TIME_GLOBAL_BUCKETS.clone(), ) .expect("failed to define a metric") }); From bd4dae8f4a5d4ea3fe2279dbd30a30a0943ffbab Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 1 Jan 2024 14:38:08 +0300 Subject: [PATCH 102/209] compute_ctl: kill postgres and sync-safekeeprs on exit. Otherwise they are left orphaned when compute_ctl is terminated with a signal. It was invisible most of the time because normally neon_local or k8s kills postgres directly and then compute_ctl finishes gracefully. However, in some tests compute_ctl gets stuck waiting for sync-safekeepers which intentionally never ends because safekeepers are offline, and we want to stop compute_ctl without leaving orphanes behind. 
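The shape of the fix is to remember child PIDs in atomics and run a small signal-listener thread that forwards SIGTERM to whichever children are still running before exiting. A condensed sketch, assuming the signal-hook and nix crates; `sleep 3600` stands in for postgres and sync-safekeepers:

use std::process::{exit, Command};
use std::sync::atomic::{AtomicU32, Ordering};
use std::thread;

use nix::sys::signal::{kill, Signal};
use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
use signal_hook::iterator::Signals;

static CHILD_PID: AtomicU32 = AtomicU32::new(0);

fn handle_exit_signal(sig: i32) -> ! {
    eprintln!("received signal {sig}, terminating child");
    let pid = CHILD_PID.load(Ordering::SeqCst);
    if pid != 0 {
        // Best effort: the child may already be gone.
        kill(nix::unistd::Pid::from_raw(pid as i32), Signal::SIGTERM).ok();
    }
    exit(1);
}

fn main() -> std::io::Result<()> {
    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
    thread::spawn(move || {
        for sig in signals.forever() {
            handle_exit_signal(sig);
        }
    });

    // Placeholder child; compute_ctl would spawn postgres / sync-safekeepers here.
    let mut child = Command::new("sleep").arg("3600").spawn()?;
    CHILD_PID.store(child.id(), Ordering::SeqCst);

    let status = child.wait()?;
    CHILD_PID.store(0, Ordering::SeqCst);
    eprintln!("child exited with {status}");
    Ok(())
}

Keeping one atomic per child lets the handler target only the processes that are actually running at the moment the signal arrives.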
This is a quite rough approach which doesn't wait for children termination. A better way would be to convert compute_ctl to async which would make waiting easy. --- Cargo.lock | 2 ++ compute_tools/Cargo.toml | 2 ++ compute_tools/src/bin/compute_ctl.rs | 32 +++++++++++++++++++++++++++- compute_tools/src/compute.rs | 8 +++++++ control_plane/src/endpoint.rs | 18 ++++++++++++---- 5 files changed, 57 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index abd87dc0da..8e0ad7c8ee 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1161,6 +1161,7 @@ dependencies = [ "flate2", "futures", "hyper", + "nix 0.26.2", "notify", "num_cpus", "opentelemetry", @@ -1171,6 +1172,7 @@ dependencies = [ "rust-ini", "serde", "serde_json", + "signal-hook", "tar", "tokio", "tokio-postgres", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 142fa08495..759a117ee9 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -13,6 +13,7 @@ clap.workspace = true flate2.workspace = true futures.workspace = true hyper = { workspace = true, features = ["full"] } +nix.workspace = true notify.workspace = true num_cpus.workspace = true opentelemetry.workspace = true @@ -20,6 +21,7 @@ postgres.workspace = true regex.workspace = true serde.workspace = true serde_json.workspace = true +signal-hook.workspace = true tar.workspace = true reqwest = { workspace = true, features = ["json"] } tokio = { workspace = true, features = ["rt", "rt-multi-thread"] } diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 436db59088..eb1d746f04 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -40,18 +40,22 @@ use std::collections::HashMap; use std::fs::File; use std::path::Path; use std::process::exit; +use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; +use nix::sys::signal::{kill, Signal}; +use signal_hook::consts::{SIGQUIT, SIGTERM}; +use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info}; use url::Url; use compute_api::responses::ComputeStatus; -use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec}; +use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID}; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; @@ -67,6 +71,13 @@ const BUILD_TAG_DEFAULT: &str = "latest"; fn main() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; + let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?; + thread::spawn(move || { + for sig in signals.forever() { + handle_exit_signal(sig); + } + }); + let build_tag = option_env!("BUILD_TAG") .unwrap_or(BUILD_TAG_DEFAULT) .to_string(); @@ -346,6 +357,7 @@ fn main() -> Result<()> { let ecode = pg .wait() .expect("failed to start waiting on Postgres process"); + PG_PID.store(0, Ordering::SeqCst); info!("Postgres exited with code {}, shutting down", ecode); exit_code = ecode.code() } @@ -519,6 +531,24 @@ fn cli() -> clap::Command { ) } +/// When compute_ctl is killed, send also termination signal to sync-safekeepers +/// to prevent leakage. TODO: it is better to convert compute_ctl to async and +/// wait for termination which would be easy then. 
+fn handle_exit_signal(sig: i32) { + info!("received {sig} termination signal"); + let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); + if ss_pid != 0 { + let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); + kill(ss_pid, Signal::SIGTERM).ok(); + } + let pg_pid = PG_PID.load(Ordering::SeqCst); + if pg_pid != 0 { + let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); + kill(pg_pid, Signal::SIGTERM).ok(); + } + exit(1); +} + #[test] fn verify_cli() { cli().debug_assert() diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index cd7be0520e..13701b7378 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -6,6 +6,8 @@ use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; +use std::sync::atomic::AtomicU32; +use std::sync::atomic::Ordering; use std::sync::{Condvar, Mutex, RwLock}; use std::thread; use std::time::Instant; @@ -34,6 +36,9 @@ use crate::spec::*; use crate::sync_sk::{check_if_synced, ping_safekeeper}; use crate::{config, extension_server}; +pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0); +pub static PG_PID: AtomicU32 = AtomicU32::new(0); + /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { // Url type maintains proper escaping @@ -501,6 +506,7 @@ impl ComputeNode { .stdout(Stdio::piped()) .spawn() .expect("postgres --sync-safekeepers failed to start"); + SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst); // `postgres --sync-safekeepers` will print all log output to stderr and // final LSN to stdout. So we pipe only stdout, while stderr will be automatically @@ -508,6 +514,7 @@ impl ComputeNode { let sync_output = sync_handle .wait_with_output() .expect("postgres --sync-safekeepers failed"); + SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst); if !sync_output.status.success() { anyhow::bail!( @@ -662,6 +669,7 @@ impl ComputeNode { }) .spawn() .expect("cannot start postgres process"); + PG_PID.store(pg.id(), Ordering::SeqCst); wait_for_postgres(&mut pg, pgdata_path)?; diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 55b66742ca..3d5dfd6311 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -46,6 +46,8 @@ use std::time::Duration; use anyhow::{anyhow, bail, Context, Result}; use compute_api::spec::RemoteExtSpec; +use nix::sys::signal::kill; +use nix::sys::signal::Signal; use serde::{Deserialize, Serialize}; use utils::id::{NodeId, TenantId, TimelineId}; @@ -439,11 +441,14 @@ impl Endpoint { Ok(()) } - fn wait_for_compute_ctl_to_exit(&self) -> Result<()> { + fn wait_for_compute_ctl_to_exit(&self, send_sigterm: bool) -> Result<()> { // TODO use background_process::stop_process instead let pidfile_path = self.endpoint_path().join("compute_ctl.pid"); let pid: u32 = std::fs::read_to_string(pidfile_path)?.parse()?; let pid = nix::unistd::Pid::from_raw(pid as i32); + if send_sigterm { + kill(pid, Signal::SIGTERM).ok(); + } crate::background_process::wait_until_stopped("compute_ctl", pid)?; Ok(()) } @@ -733,10 +738,15 @@ impl Endpoint { &None, )?; - // Also wait for the compute_ctl process to die. It might have some cleanup - // work to do after postgres stops, like syncing safekeepers, etc. + // Also wait for the compute_ctl process to die. It might have some + // cleanup work to do after postgres stops, like syncing safekeepers, + // etc. // - self.wait_for_compute_ctl_to_exit()?; + // If destroying, send it SIGTERM before waiting. 
Sometimes we do *not* + // want this cleanup: tests intentionally do stop when majority of + // safekeepers is down, so sync-safekeepers would hang otherwise. This + // could be a separate flag though. + self.wait_for_compute_ctl_to_exit(destroy)?; if destroy { println!( "Destroying postgres data directory '{}'", From 392843ad2ac550d63a8847f74f83e955addfb701 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 1 Jan 2024 14:43:44 +0300 Subject: [PATCH 103/209] Fix safekeeper START_REPLICATION (term=n). It was giving WAL only up to commit_lsn instead of flush_lsn, so recovery of uncommitted WAL since cdb08f03 hanged. Add test for this. --- safekeeper/src/send_wal.rs | 11 +---- .../regress/test_wal_acceptor_async.py | 40 +++++++++++++++++++ 2 files changed, 42 insertions(+), 9 deletions(-) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 44f14f8c7e..70590a0f95 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -391,15 +391,8 @@ impl SafekeeperPostgresHandler { // application_name: give only committed WAL (used by pageserver) or all // existing WAL (up to flush_lsn, used by walproposer or peer recovery). // The second case is always driven by a consensus leader which term - // must generally be also supplied. However we're sloppy to do this in - // walproposer recovery which will be removed soon. So TODO is to make - // it not Option'al then. - // - // Fetching WAL without term in recovery creates a small risk of this - // WAL getting concurrently garbaged if another compute rises which - // collects majority and starts fixing log on this safekeeper itself. - // That's ok as (old) proposer will never be able to commit such WAL. - let end_watch = if self.is_walproposer_recovery() { + // must be supplied. + let end_watch = if term.is_some() { EndWatch::Flush(tli.get_term_flush_lsn_watch_rx()) } else { EndWatch::Commit(tli.get_commit_lsn_watch_rx()) diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index feab7e605b..77d67cd63a 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -475,6 +475,46 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder): asyncio.run(run_unavailability(env, endpoint)) +async def run_recovery_uncommitted(env: NeonEnv): + (sk1, sk2, _) = env.safekeepers + + env.neon_cli.create_branch("test_recovery_uncommitted") + ep = env.endpoints.create_start("test_recovery_uncommitted") + ep.safe_psql("create table t(key int, value text)") + ep.safe_psql("insert into t select generate_series(1, 100), 'payload'") + + # insert with only one safekeeper up to create tail of flushed but not committed WAL + sk1.stop() + sk2.stop() + conn = await ep.connect_async() + # query should hang, so execute in separate task + bg_query = asyncio.create_task( + conn.execute("insert into t select generate_series(1, 2000), 'payload'") + ) + sleep_sec = 2 + await asyncio.sleep(sleep_sec) + # it must still be not finished + assert not bg_query.done() + # note: destoy will kill compute_ctl, preventing it waiting for hanging sync-safekeepers. + ep.stop_and_destroy() + + # Start one of sks to make quorum online plus compute and ensure they can + # sync. + sk2.start() + ep = env.endpoints.create_start( + "test_recovery_uncommitted", + ) + ep.safe_psql("insert into t select generate_series(1, 2000), 'payload'") + + +# Test pulling uncommitted WAL (up to flush_lsn) during recovery. 
+def test_recovery_uncommitted(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 3 + env = neon_env_builder.init_start() + + asyncio.run(run_recovery_uncommitted(env)) + + @dataclass class RaceConditionTest: iteration: int From d6cfcb0d93529bfae5a20a751e3f280fbe8424e9 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 1 Jan 2024 22:33:27 +0300 Subject: [PATCH 104/209] Move failpoint support code to utils. To enable them in safekeeper as well. --- Cargo.lock | 1 + libs/pageserver_api/src/models.rs | 13 ----- libs/utils/Cargo.toml | 7 +++ .../utils}/src/failpoint_support.rs | 57 ++++++++++++++++++- libs/utils/src/lib.rs | 2 + pageserver/src/bin/pageserver.rs | 3 +- pageserver/src/http/routes.rs | 32 +---------- pageserver/src/lib.rs | 2 - pageserver/src/tenant.rs | 9 ++- pageserver/src/walingest.rs | 5 +- 10 files changed, 74 insertions(+), 57 deletions(-) rename {pageserver => libs/utils}/src/failpoint_support.rs (61%) diff --git a/Cargo.lock b/Cargo.lock index 8e0ad7c8ee..73cb83d3a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5880,6 +5880,7 @@ dependencies = [ "chrono", "const_format", "criterion", + "fail", "futures", "heapless", "hex", diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index be41b610b8..dea925b468 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -557,19 +557,6 @@ pub enum DownloadRemoteLayersTaskState { ShutDown, } -pub type ConfigureFailpointsRequest = Vec; - -/// Information for configuring a single fail point -#[derive(Debug, Serialize, Deserialize)] -pub struct FailpointConfig { - /// Name of the fail point - pub name: String, - /// List of actions to take, using the format described in `fail::cfg` - /// - /// We also support `actions = "exit"` to cause the fail point to immediately exit. - pub actions: String, -} - #[derive(Debug, Serialize, Deserialize)] pub struct TimelineGcRequest { pub gc_horizon: Option, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index af0414daa2..706b7a3187 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -4,6 +4,12 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, +# which adds some runtime cost to run tests on outage conditions +testing = ["fail/failpoints"] + [dependencies] arc-swap.workspace = true sentry.workspace = true @@ -16,6 +22,7 @@ chrono.workspace = true heapless.workspace = true hex = { workspace = true, features = ["serde"] } hyper = { workspace = true, features = ["full"] } +fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true nix.workspace = true diff --git a/pageserver/src/failpoint_support.rs b/libs/utils/src/failpoint_support.rs similarity index 61% rename from pageserver/src/failpoint_support.rs rename to libs/utils/src/failpoint_support.rs index 2190eba18a..5ec532e2a6 100644 --- a/pageserver/src/failpoint_support.rs +++ b/libs/utils/src/failpoint_support.rs @@ -1,3 +1,14 @@ +//! Failpoint support code shared between pageserver and safekeepers. + +use crate::http::{ + error::ApiError, + json::{json_request, json_response}, +}; +use hyper::{Body, Request, Response, StatusCode}; +use serde::{Deserialize, Serialize}; +use tokio_util::sync::CancellationToken; +use tracing::*; + /// use with fail::cfg("$name", "return(2000)") /// /// The effect is similar to a "sleep(2000)" action, i.e. 
we sleep for the @@ -25,7 +36,7 @@ pub use __failpoint_sleep_millis_async as sleep_millis_async; // Helper function used by the macro. (A function has nicer scoping so we // don't need to decorate everything with "::") #[doc(hidden)] -pub(crate) async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { +pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) { let millis = duration_str.parse::().unwrap(); let d = std::time::Duration::from_millis(millis); @@ -71,7 +82,7 @@ pub fn init() -> fail::FailScenario<'static> { scenario } -pub(crate) fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { +pub fn apply_failpoint(name: &str, actions: &str) -> Result<(), String> { if actions == "exit" { fail::cfg_callback(name, exit_failpoint) } else { @@ -84,3 +95,45 @@ fn exit_failpoint() { tracing::info!("Exit requested by failpoint"); std::process::exit(1); } + +pub type ConfigureFailpointsRequest = Vec; + +/// Information for configuring a single fail point +#[derive(Debug, Serialize, Deserialize)] +pub struct FailpointConfig { + /// Name of the fail point + pub name: String, + /// List of actions to take, using the format described in `fail::cfg` + /// + /// We also support `actions = "exit"` to cause the fail point to immediately exit. + pub actions: String, +} + +/// Configure failpoints through http. +pub async fn failpoints_handler( + mut request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + if !fail::has_failpoints() { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Cannot manage failpoints because storage was compiled without failpoints support" + ))); + } + + let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; + for fp in failpoints { + info!("cfg failpoint: {} {}", fp.name, fp.actions); + + // We recognize one extra "action" that's not natively recognized + // by the failpoints crate: exit, to immediately kill the process + let cfg_result = apply_failpoint(&fp.name, &fp.actions); + + if let Err(err_msg) = cfg_result { + return Err(ApiError::BadRequest(anyhow::anyhow!( + "Failed to configure failpoints: {err_msg}" + ))); + } + } + + json_response(StatusCode::OK, ()) +} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index bb6c848bf4..9e9b0adfe5 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -83,6 +83,8 @@ pub mod timeout; pub mod sync; +pub mod failpoint_support; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index f65c4f4580..621ad050f4 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -31,6 +31,7 @@ use pageserver::{ virtual_file, }; use postgres_backend::AuthType; +use utils::failpoint_support; use utils::logging::TracingErrorLayerEnablement; use utils::signals::ShutdownSignals; use utils::{ @@ -126,7 +127,7 @@ fn main() -> anyhow::Result<()> { } // Initialize up failpoints support - let scenario = pageserver::failpoint_support::init(); + let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup virtual_file::init(conf.max_file_descriptors); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 11a3a2c872..157e6b4e3e 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -25,6 +25,7 @@ use tenant_size_model::{SizeResult, StorageModel}; 
use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; +use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::request_span; use utils::http::json::json_request_or_empty_body; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -66,9 +67,6 @@ use utils::{ lsn::Lsn, }; -// Imports only used for testing APIs -use pageserver_api::models::ConfigureFailpointsRequest; - // For APIs that require an Active tenant, how long should we block waiting for that state? // This is not functionally necessary (clients will retry), but avoids generating a lot of // failed API calls while tenants are activating. @@ -1293,34 +1291,6 @@ async fn handle_tenant_break( json_response(StatusCode::OK, ()) } -async fn failpoints_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - if !fail::has_failpoints() { - return Err(ApiError::BadRequest(anyhow!( - "Cannot manage failpoints because pageserver was compiled without failpoints support" - ))); - } - - let failpoints: ConfigureFailpointsRequest = json_request(&mut request).await?; - for fp in failpoints { - info!("cfg failpoint: {} {}", fp.name, fp.actions); - - // We recognize one extra "action" that's not natively recognized - // by the failpoints crate: exit, to immediately kill the process - let cfg_result = crate::failpoint_support::apply_failpoint(&fp.name, &fp.actions); - - if let Err(err_msg) = cfg_result { - return Err(ApiError::BadRequest(anyhow!( - "Failed to configure failpoints: {err_msg}" - ))); - } - } - - json_response(StatusCode::OK, ()) -} - // Run GC immediately on given timeline. async fn timeline_gc_handler( mut request: Request, diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 58adf6e8c4..c1ce0af47b 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -25,8 +25,6 @@ pub mod walingest; pub mod walrecord; pub mod walredo; -pub mod failpoint_support; - use crate::task_mgr::TaskKind; use camino::Utf8Path; use deletion_queue::DeletionQueue; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 2f2169d194..e50987c84b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -33,6 +33,7 @@ use tracing::*; use utils::backoff; use utils::completion; use utils::crashsafe::path_with_suffix_extension; +use utils::failpoint_support; use utils::fs_ext; use utils::sync::gate::Gate; use utils::sync::gate::GateGuard; @@ -890,7 +891,7 @@ impl Tenant { ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); - crate::failpoint_support::sleep_millis_async!("before-attaching-tenant"); + failpoint_support::sleep_millis_async!("before-attaching-tenant"); let preload = match preload { Some(p) => p, @@ -1002,7 +1003,7 @@ impl Tenant { // IndexPart is the source of truth. self.clean_up_timelines(&existent_timelines)?; - crate::failpoint_support::sleep_millis_async!("attach-before-activate"); + failpoint_support::sleep_millis_async!("attach-before-activate"); info!("Done"); @@ -2839,9 +2840,7 @@ impl Tenant { } }; - crate::failpoint_support::sleep_millis_async!( - "gc_iteration_internal_after_getting_gc_timelines" - ); + failpoint_support::sleep_millis_async!("gc_iteration_internal_after_getting_gc_timelines"); // If there is nothing to GC, we don't want any messages in the INFO log. 
if !gc_timelines.is_empty() { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 1d14214030..a6a8972970 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,6 +29,7 @@ use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; use tracing::*; +use utils::failpoint_support; use crate::context::RequestContext; use crate::metrics::WAL_INGEST; @@ -344,9 +345,7 @@ impl<'a> WalIngest<'a> { // particular point in the WAL. For more fine-grained control, // we could peek into the message and only pause if it contains // a particular string, for example, but this is enough for now. - crate::failpoint_support::sleep_millis_async!( - "wal-ingest-logical-message-sleep" - ); + failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep"); } else if let Some(path) = prefix.strip_prefix("neon-file:") { modification.put_file(path, message, ctx).await?; } From 2f83f852913887a630517b1b124424946761e7f4 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 1 Jan 2024 23:32:24 +0300 Subject: [PATCH 105/209] Add failpoint support to safekeeper. Just a copy paste from pageserver. --- Cargo.lock | 1 + safekeeper/Cargo.toml | 7 ++++++ safekeeper/src/bin/safekeeper.rs | 17 ++++++++++++- safekeeper/src/http/routes.rs | 8 ++++++ test_runner/fixtures/neon_fixtures.py | 36 +++++++++++++++++++++++---- 5 files changed, 63 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 73cb83d3a7..55e868a6d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4449,6 +4449,7 @@ dependencies = [ "clap", "const_format", "crc32c", + "fail", "fs2", "futures", "git-version", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index cccb4ebd79..4015c27933 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -4,6 +4,12 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +default = [] +# Enables test-only APIs, incuding failpoints. In particular, enables the `fail_point!` macro, +# which adds some runtime cost to run tests on outage conditions +testing = ["fail/failpoints"] + [dependencies] async-stream.workspace = true anyhow.workspace = true @@ -16,6 +22,7 @@ chrono.workspace = true clap = { workspace = true, features = ["derive"] } const_format.workspace = true crc32c.workspace = true +fail.workspace = true fs2.workspace = true git-version.workspace = true hex.workspace = true diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index e59deb9fda..33047051df 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -54,6 +54,19 @@ const ID_FILE_NAME: &str = "safekeeper.id"; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +const FEATURES: &[&str] = &[ + #[cfg(feature = "testing")] + "testing", +]; + +fn version() -> String { + format!( + "{GIT_VERSION} failpoints: {}, features: {:?}", + fail::has_failpoints(), + FEATURES, + ) +} + const ABOUT: &str = r#" A fleet of safekeepers is responsible for reliably storing WAL received from compute, passing it through consensus (mitigating potential computes brain @@ -167,7 +180,9 @@ async fn main() -> anyhow::Result<()> { // getting 'argument cannot be used multiple times' error. This seems to be // impossible with pure Derive API, so convert struct to Command, modify it, // parse arguments, and then fill the struct back. 
- let cmd = ::command().args_override_self(true); + let cmd = ::command() + .args_override_self(true) + .version(version()); let mut matches = cmd.get_matches(); let mut args = ::from_arg_matches_mut(&mut matches)?; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index c48b5330b3..25a3334e63 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -12,6 +12,8 @@ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; use tokio::fs::File; use tokio::io::AsyncReadExt; +use tokio_util::sync::CancellationToken; +use utils::failpoint_support::failpoints_handler; use std::io::Write as _; use tokio::sync::mpsc; @@ -444,6 +446,12 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(Arc::new(conf)) .data(auth) .get("/v1/status", |r| request_span(r, status_handler)) + .put("/v1/failpoints", |r| { + request_span(r, move |r| async { + let cancel = CancellationToken::new(); + failpoints_handler(r, cancel).await + }) + }) // Will be used in the future instead of implicit timeline creation .post("/v1/tenant/timeline", |r| { request_span(r, timeline_create_handler) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 597e311e02..9aa82d8854 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -890,8 +890,8 @@ class NeonEnv: """Get list of safekeeper endpoints suitable for safekeepers GUC""" return ",".join(f"localhost:{wa.port.pg}" for wa in self.safekeepers) - def get_pageserver_version(self) -> str: - bin_pageserver = str(self.neon_binpath / "pageserver") + def get_binary_version(self, binary_name: str) -> str: + bin_pageserver = str(self.neon_binpath / binary_name) res = subprocess.run( [bin_pageserver, "--version"], check=True, @@ -1656,7 +1656,7 @@ class NeonPageserver(PgProtocol): self.running = False self.service_port = port self.config_override = config_override - self.version = env.get_pageserver_version() + self.version = env.get_binary_version("pageserver") # After a test finishes, we will scrape the log to see if there are any # unexpected error messages. 
If your test expects an error, add it to @@ -2924,7 +2924,8 @@ class Safekeeper: return res def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: - return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token) + is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper") + return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled) def data_dir(self) -> str: return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") @@ -2975,10 +2976,11 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): HTTPError = requests.HTTPError - def __init__(self, port: int, auth_token: Optional[str] = None): + def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled = False): super().__init__() self.port = port self.auth_token = auth_token + self.is_testing_enabled = is_testing_enabled if auth_token is not None: self.headers["Authorization"] = f"Bearer {auth_token}" @@ -2986,6 +2988,30 @@ class SafekeeperHttpClient(requests.Session): def check_status(self): self.get(f"http://localhost:{self.port}/v1/status").raise_for_status() + def is_testing_enabled_or_skip(self): + if not self.is_testing_enabled: + pytest.skip("safekeeper was built without 'testing' feature") + + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): + self.is_testing_enabled_or_skip() + + if isinstance(config_strings, tuple): + pairs = [config_strings] + else: + pairs = config_strings + + log.info(f"Requesting config failpoints: {repr(pairs)}") + + res = self.put( + f"http://localhost:{self.port}/v1/failpoints", + json=[{"name": name, "actions": actions} for name, actions in pairs], + ) + log.info(f"Got failpoints request response code {res.status_code}") + res.raise_for_status() + res_json = res.json() + assert res_json is None + return res_json + def debug_dump(self, params: Optional[Dict[str, str]] = None) -> Dict[str, Any]: params = params or {} res = self.get(f"http://localhost:{self.port}/v1/debug_dump", params=params) From 4a227484bff969ddade5b42ee846c6056870ad1a Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 29 Dec 2023 23:09:36 +0300 Subject: [PATCH 106/209] Add large insertion and slow WAL sending to test_hot_standby. To exercise MAX_SEND_SIZE sending from safekeeper; we've had a bug with WAL records torn across several XLogData messages. Add failpoint to safekeeper to slow down sending. Also check for corrupted WAL complains in standby log. Make the test a bit simpler in passing, e.g. we don't need explicit commits as autocommit is enabled by default. 
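For context, a rough sketch of what a sleep-style failpoint like the one added here does at runtime. This assumes the fail and tokio crates (with the fail crate's failpoints feature enabled via the testing feature from the previous patch); maybe_sleep is an illustrative stand-in for the sleep_millis_async! macro, not the real implementation:

    // Sketch only. Configuring the failpoint with a `return(<millis>)` action
    // turns it into a per-call async sleep; leaving it unconfigured is a no-op.
    async fn maybe_sleep(name: &str) {
        // fail::eval runs the closure only when the failpoint is configured;
        // with "return(100)" the closure is handed Some("100").
        if let Some(Some(millis)) = fail::eval(name, |v| v) {
            let ms: u64 = millis.parse().unwrap_or(0);
            tokio::time::sleep(std::time::Duration::from_millis(ms)).await;
        }
    }

    // In a test, either fail::cfg("sk-send-wal-replica-sleep", "return(100)")
    // or the PUT /v1/failpoints endpoint added in the previous patch then
    // injects a 100 ms pause per WAL chunk sent to a physical replica, which
    // is what forces full MAX_SEND_SIZE messages in this test.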
https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719 https://github.com/neondatabase/cloud/issues/9057 --- safekeeper/src/send_wal.rs | 6 ++ test_runner/fixtures/neon_fixtures.py | 17 +++-- test_runner/regress/test_hot_standby.py | 91 +++++++++++++++---------- 3 files changed, 73 insertions(+), 41 deletions(-) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 70590a0f95..bd1d306968 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -17,6 +17,7 @@ use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; +use utils::failpoint_support; use utils::id::TenantTimelineId; use utils::lsn::AtomicLsn; use utils::pageserver_feedback::PageserverFeedback; @@ -559,6 +560,11 @@ impl WalSender<'_, IO> { })) .await?; + if let Some(appname) = &self.appname { + if appname == "replica" { + failpoint_support::sleep_millis_async!("sk-send-wal-replica-sleep"); + } + } trace!( "sent {} bytes of WAL {}-{}", send_size, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9aa82d8854..5b1a8ba27d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -347,7 +347,9 @@ class PgProtocol: """ return self.safe_psql_many([query], **kwargs)[0] - def safe_psql_many(self, queries: List[str], **kwargs: Any) -> List[List[Tuple[Any, ...]]]: + def safe_psql_many( + self, queries: List[str], log_query=True, **kwargs: Any + ) -> List[List[Tuple[Any, ...]]]: """ Execute queries against the node and return all rows. This method passes all extra params to connstr. @@ -356,7 +358,8 @@ class PgProtocol: with closing(self.connect(**kwargs)) as conn: with conn.cursor() as cur: for query in queries: - log.info(f"Executing query: {query}") + if log_query: + log.info(f"Executing query: {query}") cur.execute(query) if cur.description is None: @@ -365,11 +368,11 @@ class PgProtocol: result.append(cur.fetchall()) return result - def safe_psql_scalar(self, query) -> Any: + def safe_psql_scalar(self, query, log_query=True) -> Any: """ Execute query returning single row with single column. 
""" - return self.safe_psql(query)[0][0] + return self.safe_psql(query, log_query=log_query)[0][0] @dataclass @@ -2925,7 +2928,9 @@ class Safekeeper: def http_client(self, auth_token: Optional[str] = None) -> SafekeeperHttpClient: is_testing_enabled = '"testing"' in self.env.get_binary_version("safekeeper") - return SafekeeperHttpClient(port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled) + return SafekeeperHttpClient( + port=self.port.http, auth_token=auth_token, is_testing_enabled=is_testing_enabled + ) def data_dir(self) -> str: return os.path.join(self.env.repo_dir, "safekeepers", f"sk{self.id}") @@ -2976,7 +2981,7 @@ class SafekeeperMetrics: class SafekeeperHttpClient(requests.Session): HTTPError = requests.HTTPError - def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled = False): + def __init__(self, port: int, auth_token: Optional[str] = None, is_testing_enabled=False): super().__init__() self.port = port self.auth_token = auth_token diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 031fd2857d..7822e29ed9 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -1,19 +1,59 @@ +import os +import re import time -from fixtures.neon_fixtures import NeonEnv +from fixtures.log_helper import log +from fixtures.neon_fixtures import Endpoint, NeonEnv + + +def wait_caughtup(primary: Endpoint, secondary: Endpoint): + primary_lsn = primary.safe_psql_scalar( + "SELECT pg_current_wal_insert_lsn()::text", log_query=False + ) + while True: + secondary_lsn = secondary.safe_psql_scalar( + "SELECT pg_last_wal_replay_lsn()", log_query=False + ) + caught_up = secondary_lsn >= primary_lsn + log.info(f"caughtup={caught_up}, primary_lsn={primary_lsn}, secondary_lsn={secondary_lsn}") + if caught_up: + return + time.sleep(1) + + +# Check for corrupted WAL messages which might otherwise go unnoticed if +# reconnection fixes this. +def scan_standby_log_for_errors(secondary): + log_path = secondary.endpoint_path() / "compute.log" + with log_path.open("r") as f: + markers = re.compile( + r"incorrect resource manager data|record with incorrect|invalid magic number|unexpected pageaddr" + ) + for line in f: + if markers.search(line): + log.info(f"bad error in standby log: {line}") + raise AssertionError() def test_hot_standby(neon_simple_env: NeonEnv): env = neon_simple_env + # We've had a bug caused by WAL records split across multiple XLogData + # messages resulting in corrupted WAL complains on standby. It reproduced + # only when sending from safekeeper is slow enough to grab full + # MAX_SEND_SIZE messages. So insert sleep through failpoints, but only in + # one conf to decrease test time. 
+ slow_down_send = "[debug-pg16]" in os.environ.get("PYTEST_CURRENT_TEST", "") + if slow_down_send: + sk_http = env.safekeepers[0].http_client() + sk_http.configure_failpoints([("sk-send-wal-replica-sleep", "return(100)")]) + with env.endpoints.create_start( branch_name="main", endpoint_id="primary", ) as primary: time.sleep(1) with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: - primary_lsn = None - caught_up = False queries = [ "SHOW neon.timeline_id", "SHOW neon.tenant_id", @@ -26,23 +66,6 @@ def test_hot_standby(neon_simple_env: NeonEnv): with p_con.cursor() as p_cur: p_cur.execute("CREATE TABLE test AS SELECT generate_series(1, 100) AS i") - # Explicit commit to make sure other connections (and replicas) can - # see the changes of this commit. - p_con.commit() - - with p_con.cursor() as p_cur: - p_cur.execute("SELECT pg_current_wal_insert_lsn()::text") - res = p_cur.fetchone() - assert res is not None - (lsn,) = res - primary_lsn = lsn - - # Explicit commit to make sure other connections (and replicas) can - # see the changes of this commit. - # Note that this may generate more WAL if the transaction has changed - # things, but we don't care about that. - p_con.commit() - for query in queries: with p_con.cursor() as p_cur: p_cur.execute(query) @@ -51,30 +74,28 @@ def test_hot_standby(neon_simple_env: NeonEnv): response = res responses[query] = response + # insert more data to make safekeeper send MAX_SEND_SIZE messages + if slow_down_send: + primary.safe_psql("create table t(key int, value text)") + primary.safe_psql("insert into t select generate_series(1, 100000), 'payload'") + + wait_caughtup(primary, secondary) + with secondary.connect() as s_con: with s_con.cursor() as s_cur: s_cur.execute("SELECT 1 WHERE pg_is_in_recovery()") res = s_cur.fetchone() assert res is not None - while not caught_up: - with s_con.cursor() as secondary_cursor: - secondary_cursor.execute("SELECT pg_last_wal_replay_lsn()") - res = secondary_cursor.fetchone() - assert res is not None - (secondary_lsn,) = res - # There may be more changes on the primary after we got our LSN - # due to e.g. autovacuum, but that shouldn't impact the content - # of the tables, so we check whether we've replayed up to at - # least after the commit of the `test` table. - caught_up = secondary_lsn >= primary_lsn - - # Explicit commit to flush any transient transaction-level state. - s_con.commit() - for query in queries: with s_con.cursor() as secondary_cursor: secondary_cursor.execute(query) response = secondary_cursor.fetchone() assert response is not None assert response == responses[query] + + scan_standby_log_for_errors(secondary) + + # clean up + if slow_down_send: + sk_http.configure_failpoints(("sk-send-wal-replica-sleep", "off")) From 54aa31980597742f28b022bd0a729b97d910f0b7 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Sat, 30 Dec 2023 00:31:19 +0300 Subject: [PATCH 107/209] Don't split WAL record across two XLogData's when sending from safekeepers. As protocol demands. Not following this makes standby complain about corrupted WAL in various ways. 
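A minimal sketch of the rounding rule, assuming the stock Postgres WAL page size (XLOG_BLCKSZ = 8192) and a 128 KiB per-message cap; the real code works on Lsn values via block_offset() and may additionally cap the read at a segment boundary:

    // Illustrative only: choose the end of the next XLogData chunk so it never
    // stops in the middle of a WAL page (and hence never inside a record
    // header), unless we are sending right up to the end of available WAL.
    const XLOG_BLCKSZ: u64 = 8192; // assumed WAL page size
    const MAX_SEND_SIZE: u64 = 128 * 1024; // assumed per-message cap

    fn chunk_end(start_pos: u64, end_pos: u64) -> u64 {
        let candidate = start_pos + MAX_SEND_SIZE;
        if candidate >= end_pos {
            // Sending everything available; end_pos (commit/flush LSN) already
            // sits on a record boundary, so no rounding is needed.
            end_pos
        } else {
            // Otherwise cut at the last page boundary at or before `candidate`.
            candidate - (candidate % XLOG_BLCKSZ)
        }
    }

    #[test]
    fn chunk_never_ends_mid_page_unless_capped() {
        assert_eq!(chunk_end(0, 1 << 30), 131072); // already page-aligned
        assert_eq!(chunk_end(100, 1 << 30), 131072); // 131172 rounded down
        assert_eq!(chunk_end(100, 5000), 5000); // capped by available WAL
    }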
https://neondb.slack.com/archives/C05L7D1JAUS/p1703774799114719 closes https://github.com/neondatabase/cloud/issues/9057 --- safekeeper/src/send_wal.rs | 22 +++++++++++++++------- safekeeper/src/wal_storage.rs | 3 +++ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index bd1d306968..9a5657a40d 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -529,12 +529,19 @@ impl WalSender<'_, IO> { ); // try to send as much as available, capped by MAX_SEND_SIZE - let mut send_size = self - .end_pos - .checked_sub(self.start_pos) - .context("reading wal without waiting for it first")? - .0 as usize; - send_size = min(send_size, self.send_buf.len()); + let mut chunk_end_pos = self.start_pos + MAX_SEND_SIZE as u64; + // if we went behind available WAL, back off + if chunk_end_pos >= self.end_pos { + chunk_end_pos = self.end_pos; + } else { + // If sending not up to end pos, round down to page boundary to + // avoid breaking WAL record not at page boundary, as protocol + // demands. See walsender.c (XLogSendPhysical). + chunk_end_pos = chunk_end_pos + .checked_sub(chunk_end_pos.block_offset()) + .unwrap(); + } + let send_size = (chunk_end_pos.0 - self.start_pos.0) as usize; let send_buf = &mut self.send_buf[..send_size]; let send_size: usize; { @@ -545,7 +552,8 @@ impl WalSender<'_, IO> { } else { None }; - // read wal into buffer + // Read WAL into buffer. send_size can be additionally capped to + // segment boundary here. send_size = self.wal_reader.read(send_buf).await? }; let send_buf = &send_buf[..send_size]; diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index fa44b24258..e7538f805c 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -565,6 +565,9 @@ impl WalReader { }) } + /// Read WAL at current position into provided buf, returns number of bytes + /// read. It can be smaller than buf size only if segment boundary is + /// reached. 
pub async fn read(&mut self, buf: &mut [u8]) -> Result { // If this timeline is new, we may not have a full segment yet, so // we pad the first bytes of the timeline's first WAL segment with 0s From aa72a22661ca841868ab11b21137c47e6c07c3e3 Mon Sep 17 00:00:00 2001 From: vipvap <91739071+vipvap@users.noreply.github.com> Date: Mon, 8 Jan 2024 09:26:27 +0000 Subject: [PATCH 108/209] Release 2024-01-08 (#6286) Release 2024-01-08 --- .github/workflows/build_and_test.yml | 8 +- Cargo.lock | 3 + compute_tools/src/monitor.rs | 25 +- control_plane/src/pageserver.rs | 7 + control_plane/src/tenant_migration.rs | 21 +- docs/sourcetree.md | 8 +- libs/pageserver_api/src/key.rs | 2 +- libs/pageserver_api/src/keyspace.rs | 3 + libs/pageserver_api/src/shard.rs | 22 +- libs/postgres_backend/src/lib.rs | 23 +- libs/remote_storage/src/azure_blob.rs | 6 + libs/remote_storage/src/lib.rs | 13 + libs/remote_storage/src/local_fs.rs | 14 + libs/remote_storage/src/s3_bucket.rs | 32 + libs/remote_storage/src/s3_bucket/metrics.rs | 8 +- libs/remote_storage/src/simulate_failures.rs | 7 + libs/safekeeper_api/src/models.rs | 6 + libs/utils/src/http/error.rs | 7 + libs/utils/src/lib.rs | 2 + libs/utils/src/sync/gate.rs | 6 + libs/utils/src/yielding_loop.rs | 35 + libs/walproposer/src/walproposer.rs | 2 +- pageserver/benches/bench_walredo.rs | 5 +- pageserver/client/src/mgmt_api.rs | 14 +- pageserver/client/src/page_service.rs | 9 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 38 +- pageserver/src/basebackup.rs | 15 +- pageserver/src/config.rs | 58 +- pageserver/src/http/routes.rs | 36 + pageserver/src/import_datadir.rs | 9 +- pageserver/src/lib.rs | 4 + pageserver/src/metrics.rs | 96 ++- pageserver/src/page_service.rs | 240 ++++-- pageserver/src/pgdatadir_mapping.rs | 275 ++++-- pageserver/src/task_mgr.rs | 5 +- pageserver/src/tenant.rs | 17 +- pageserver/src/tenant/config.rs | 2 + pageserver/src/tenant/delete.rs | 2 +- pageserver/src/tenant/mgr.rs | 289 ++++--- .../src/tenant/remote_timeline_client.rs | 28 +- pageserver/src/tenant/secondary.rs | 142 +++- pageserver/src/tenant/secondary/downloader.rs | 801 ++++++++++++++++++ .../src/tenant/secondary/heatmap_uploader.rs | 502 ++++------- pageserver/src/tenant/secondary/scheduler.rs | 361 ++++++++ .../tenant/storage_layer/inmemory_layer.rs | 43 +- pageserver/src/tenant/storage_layer/layer.rs | 1 + pageserver/src/tenant/tasks.rs | 2 + pageserver/src/tenant/timeline.rs | 173 ++-- pageserver/src/tenant/timeline/walreceiver.rs | 1 + .../walreceiver/connection_manager.rs | 3 + .../walreceiver/walreceiver_connection.rs | 36 +- pageserver/src/walingest.rs | 272 +++--- pageserver/src/walredo.rs | 64 +- pgxn/neon/libpagestore.c | 25 +- pgxn/neon/walproposer.c | 268 +++--- pgxn/neon/walproposer.h | 16 +- pgxn/neon/walproposer_pg.c | 77 +- poetry.lock | 174 +--- pre-commit.py | 24 +- pyproject.toml | 20 +- s3_scrubber/Cargo.toml | 3 + s3_scrubber/src/lib.rs | 8 +- s3_scrubber/src/main.rs | 57 +- s3_scrubber/src/scan_metadata.rs | 11 +- safekeeper/Cargo.toml | 1 + safekeeper/src/control_file.rs | 7 +- safekeeper/src/copy_timeline.rs | 250 ++++++ safekeeper/src/debug_dump.rs | 57 ++ safekeeper/src/http/routes.rs | 65 +- safekeeper/src/lib.rs | 1 + safekeeper/src/pull_timeline.rs | 131 ++- safekeeper/src/timeline.rs | 3 +- safekeeper/src/timelines_global_map.rs | 15 +- safekeeper/src/wal_backup.rs | 61 +- safekeeper/src/wal_storage.rs | 2 +- scripts/export_import_between_pageservers.py | 2 +- scripts/reformat | 4 +- test_runner/fixtures/neon_fixtures.py | 26 +- 
test_runner/fixtures/pageserver/http.py | 10 +- test_runner/performance/test_perf_olap.py | 7 +- .../performance/test_wal_backpressure.py | 3 +- .../regress/test_attach_tenant_config.py | 13 +- test_runner/regress/test_compatibility.py | 8 +- test_runner/regress/test_crafted_wal_end.py | 1 - test_runner/regress/test_layer_eviction.py | 4 +- .../regress/test_layers_from_future.py | 3 + test_runner/regress/test_pageserver_api.py | 3 +- .../regress/test_pageserver_secondary.py | 149 +++- test_runner/regress/test_tenant_detach.py | 14 +- test_runner/regress/test_tenant_relocation.py | 6 +- .../test_tenants_with_remote_storage.py | 8 +- test_runner/regress/test_wal_acceptor.py | 82 +- vm-image-spec.yaml | 21 +- 93 files changed, 4014 insertions(+), 1429 deletions(-) create mode 100644 libs/utils/src/yielding_loop.rs create mode 100644 pageserver/src/tenant/secondary/downloader.rs create mode 100644 pageserver/src/tenant/secondary/scheduler.rs create mode 100644 safekeeper/src/copy_timeline.rs diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 78deff6e85..880d6044f2 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -105,11 +105,11 @@ jobs: - name: Install Python deps run: ./scripts/pysync - - name: Run ruff to ensure code format - run: poetry run ruff . + - name: Run `ruff check` to ensure code format + run: poetry run ruff check . - - name: Run black to ensure code format - run: poetry run black --diff --check . + - name: Run `ruff format` to ensure code format + run: poetry run ruff format --check . - name: Run mypy to check types run: poetry run mypy . diff --git a/Cargo.lock b/Cargo.lock index 55e868a6d5..4dd195a895 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4405,12 +4405,14 @@ dependencies = [ "async-stream", "aws-config", "aws-sdk-s3", + "aws-smithy-async", "bincode", "bytes", "chrono", "clap", "crc32c", "either", + "futures", "futures-util", "hex", "histogram", @@ -4473,6 +4475,7 @@ dependencies = [ "serde", "serde_json", "serde_with", + "sha2", "signal-hook", "storage_broker", "thiserror", diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index f974d6023d..fd19b7e53f 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -3,7 +3,7 @@ use std::{thread, time::Duration}; use chrono::{DateTime, Utc}; use postgres::{Client, NoTls}; -use tracing::{debug, info}; +use tracing::{debug, info, warn}; use crate::compute::ComputeNode; @@ -84,6 +84,29 @@ fn watch_compute_activity(compute: &ComputeNode) { } } + // If there are existing (logical) walsenders, do not suspend. + // + // walproposer doesn't currently show up in pg_stat_replication, + // but protect if it will be + let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';"; + match cli.query_one(ws_count_query, &[]) { + Ok(r) => match r.try_get::<&str, i64>("count") { + Ok(num_ws) => { + if num_ws > 0 { + last_active = Some(Utc::now()); + } + } + Err(e) => { + warn!("failed to parse ws count: {:?}", e); + continue; + } + }, + Err(e) => { + warn!("failed to get list of walsenders: {:?}", e); + continue; + } + } + // Update the last activity in the shared state if we got a more recent one. let mut state = compute.state.lock().unwrap(); // NB: `Some()` is always greater than `None`. 
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 7d490016bf..fb0d251722 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -485,6 +485,13 @@ impl PageServerNode { Ok(self.http_client.list_timelines(*tenant_id).await?) } + pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> { + Ok(self + .http_client + .tenant_secondary_download(*tenant_id) + .await?) + } + pub async fn timeline_create( &self, tenant_id: TenantId, diff --git a/control_plane/src/tenant_migration.rs b/control_plane/src/tenant_migration.rs index 79df108896..23ea8f4060 100644 --- a/control_plane/src/tenant_migration.rs +++ b/control_plane/src/tenant_migration.rs @@ -11,6 +11,7 @@ use crate::{ use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; +use pageserver_api::shard::TenantShardId; use std::collections::HashMap; use std::time::Duration; use utils::{ @@ -40,9 +41,9 @@ async fn await_lsn( loop { let latest = match get_lsns(tenant_id, pageserver).await { Ok(l) => l, - Err(e) => { + Err(_e) => { println!( - "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})", + "🕑 Waiting for pageserver {} to activate...", pageserver.conf.id ); std::thread::sleep(Duration::from_millis(500)); @@ -89,7 +90,7 @@ pub async fn migrate_tenant( tenant_id: TenantId, dest_ps: PageServerNode, ) -> anyhow::Result<()> { - // Get a new generation + println!("🤔 Checking existing status..."); let attachment_service = AttachmentService::from_env(env); fn build_location_config( @@ -135,6 +136,20 @@ pub async fn migrate_tenant( baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?); } + println!( + "🔁 Downloading latest layers to destination pageserver {}", + dest_ps.conf.id + ); + match dest_ps + .tenant_secondary_download(&TenantShardId::unsharded(tenant_id)) + .await + { + Ok(()) => {} + Err(_) => { + println!(" (skipping, destination wasn't in secondary mode)") + } + } + let gen = attachment_service .attach_hook(tenant_id, dest_ps.conf.id) .await?; diff --git a/docs/sourcetree.md b/docs/sourcetree.md index 95bed83ae5..12fa80349e 100644 --- a/docs/sourcetree.md +++ b/docs/sourcetree.md @@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment. Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`. ### Obligatory checks -We force code formatting via `black`, `ruff`, and type hints via `mypy`. +We force code formatting via `ruff`, and type hints via `mypy`. Run the following commands in the repository's root (next to `pyproject.toml`): ```bash -poetry run black . # All code is reformatted -poetry run ruff . # Python linter -poetry run mypy . # Ensure there are no typing errors +poetry run ruff format . # All code is reformatted +poetry run ruff check . # Python linter +poetry run mypy . # Ensure there are no typing errors ``` **WARNING**: do not run `mypy` from a directory other than the root of the repository. 
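As a rough sketch of the ordering the tenant migration change above aims for, using the tenant_secondary_download wrapper added in this patch; the import paths and the surrounding flow are assumptions, not the real control-plane code:

    // Schematic only: warm the destination pageserver before cutting a tenant
    // over to it.
    use control_plane::pageserver::PageServerNode;
    use pageserver_api::shard::TenantShardId;
    use utils::id::TenantId;

    async fn prewarm_destination(dest: &PageServerNode, tenant_id: TenantId) {
        // Ask the destination, ideally still running in Secondary mode, to
        // download the layers named in its latest heatmap. If it is not a
        // secondary, skip the warm-up and fall back to a cold attach.
        if dest
            .tenant_secondary_download(&TenantShardId::unsharded(tenant_id))
            .await
            .is_err()
        {
            println!("  (skipping, destination wasn't in secondary mode)");
        }
        // Only after this does the migration request a new generation and
        // attach the tenant on the destination, so its cache is warm at
        // cutover.
    }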
diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index d680a5600e..3e1bba2a06 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -142,7 +142,7 @@ impl Key { } pub fn is_rel_block_key(key: &Key) -> bool { - key.field1 == 0x00 && key.field4 != 0 + key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff } impl std::str::FromStr for Key { diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 16651c322e..80183506d8 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -124,6 +124,9 @@ impl KeySpaceAccum { if range.start == accum.end { accum.end = range.end; } else { + // TODO: to efficiently support small sharding stripe sizes, we should avoid starting + // a new range here if the skipped region was all keys that don't belong on this shard. + // (https://github.com/neondatabase/neon/issues/6247) assert!(range.start > accum.end); self.ranges.push(accum.clone()); *accum = range; diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 3e4936eec4..18ef2be523 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -422,6 +422,21 @@ impl ShardIdentity { } } + /// Return true if the key should be discarded if found in this shard's + /// data store, e.g. during compaction after a split + pub fn is_key_disposable(&self, key: &Key) -> bool { + if key_is_shard0(key) { + // Q: Why can't we dispose of shard0 content if we're not shard 0? + // A: because the WAL ingestion logic currently ingests some shard 0 + // content on all shards, even though it's only read on shard 0. If we + // dropped it, then subsequent WAL ingest to these keys would encounter + // an error. + false + } else { + !self.is_key_local(key) + } + } + pub fn shard_slug(&self) -> String { if self.count > ShardCount(0) { format!("-{:02x}{:02x}", self.number.0, self.count.0) @@ -515,12 +530,7 @@ fn key_is_shard0(key: &Key) -> bool { // relation pages are distributed to shards other than shard zero. Everything else gets // stored on shard 0. This guarantees that shard 0 can independently serve basebackup // requests, and any request other than those for particular blocks in relations. - // - // In this condition: - // - is_rel_block_key includes only relations, i.e. excludes SLRU data and - // all metadata. - // - field6 is set to -1 for relation size pages. 
- !(is_rel_block_key(key) && key.field6 != 0xffffffff) + !is_rel_block_key(key) } /// Provide the same result as the function in postgres `hashfn.h` with the same name diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 1dae008a4f..73d25619c3 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -35,6 +35,12 @@ pub enum QueryError { /// We were instructed to shutdown while processing the query #[error("Shutting down")] Shutdown, + /// Query handler indicated that client should reconnect + #[error("Server requested reconnect")] + Reconnect, + /// Query named an entity that was not found + #[error("Not found: {0}")] + NotFound(std::borrow::Cow<'static, str>), /// Authentication failure #[error("Unauthorized: {0}")] Unauthorized(std::borrow::Cow<'static, str>), @@ -54,9 +60,9 @@ impl From for QueryError { impl QueryError { pub fn pg_error_code(&self) -> &'static [u8; 5] { match self { - Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure + Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN, - Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR, + Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR, Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error } } @@ -425,6 +431,11 @@ impl PostgresBackend { info!("Stopped due to shutdown"); Ok(()) } + Err(QueryError::Reconnect) => { + // Dropping out of this loop implicitly disconnects + info!("Stopped due to handler reconnect request"); + Ok(()) + } Err(QueryError::Disconnected(e)) => { info!("Disconnected ({e:#})"); // Disconnection is not an error: we just use it that way internally to drop @@ -974,7 +985,9 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I pub fn short_error(e: &QueryError) -> String { match e { QueryError::Disconnected(connection_error) => connection_error.to_string(), + QueryError::Reconnect => "reconnect".to_string(), QueryError::Shutdown => "shutdown".to_string(), + QueryError::NotFound(_) => "not found".to_string(), QueryError::Unauthorized(_e) => "JWT authentication error".to_string(), QueryError::SimulatedConnectionError => "simulated connection error".to_string(), QueryError::Other(e) => format!("{e:#}"), @@ -996,9 +1009,15 @@ fn log_query_error(query: &str, e: &QueryError) { QueryError::SimulatedConnectionError => { error!("query handler for query '{query}' failed due to a simulated connection error") } + QueryError::Reconnect => { + info!("query handler for '{query}' requested client to reconnect") + } QueryError::Shutdown => { info!("query handler for '{query}' cancelled during tenant shutdown") } + QueryError::NotFound(reason) => { + info!("query handler for '{query}' entity not found: {reason}") + } QueryError::Unauthorized(e) => { warn!("query handler for '{query}' failed with authentication error: {e}"); } diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 7ea1103eb2..18cf5d97ba 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -322,6 +322,12 @@ impl RemoteStorage for AzureBlobStorage { } Ok(()) } + + async fn copy(&self, _from: &RemotePath, _to: &RemotePath) -> anyhow::Result<()> { + Err(anyhow::anyhow!( + "copy for azure blob storage is not implemented" + )) + } } pin_project_lite::pin_project! 
{ diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 3e408e3119..942d0016b0 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -207,6 +207,9 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>; async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>; + + /// Copy a remote object inside a bucket from one path to another. + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>; } pub type DownloadStream = Pin> + Unpin + Send + Sync>>; @@ -374,6 +377,15 @@ impl GenericRemoteStorage { Self::Unreliable(s) => s.delete_objects(paths).await, } } + + pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.copy(from, to).await, + Self::AwsS3(s) => s.copy(from, to).await, + Self::AzureBlob(s) => s.copy(from, to).await, + Self::Unreliable(s) => s.copy(from, to).await, + } + } } impl GenericRemoteStorage { @@ -660,6 +672,7 @@ impl ConcurrencyLimiter { RequestKind::Put => &self.write, RequestKind::List => &self.read, RequestKind::Delete => &self.write, + RequestKind::Copy => &self.write, } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index d1e7d325b9..bf8b6b5dde 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -409,6 +409,20 @@ impl RemoteStorage for LocalFs { } Ok(()) } + + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + let from_path = from.with_base(&self.storage_root); + let to_path = to.with_base(&self.storage_root); + create_target_directory(&to_path).await?; + fs::copy(&from_path, &to_path).await.with_context(|| { + format!( + "Failed to copy file from '{from_path}' to '{to_path}'", + from_path = from_path, + to_path = to_path + ) + })?; + Ok(()) + } } fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf { diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 0f95458ad1..d7b41edaaf 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -493,6 +493,38 @@ impl RemoteStorage for S3Bucket { Ok(()) } + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + let kind = RequestKind::Copy; + let _guard = self.permit(kind).await; + + let started_at = start_measuring_requests(kind); + + // we need to specify bucket_name as a prefix + let copy_source = format!( + "{}/{}", + self.bucket_name, + self.relative_path_to_s3_object(from) + ); + + let res = self + .client + .copy_object() + .bucket(self.bucket_name.clone()) + .key(self.relative_path_to_s3_object(to)) + .copy_source(copy_source) + .send() + .await; + + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + + res?; + + Ok(()) + } + async fn download(&self, from: &RemotePath) -> Result { // if prefix is not none then download file `prefix/from` // if prefix is none then download file `from` diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/s3_bucket/metrics.rs index ea11edafa5..21dde14906 100644 --- a/libs/remote_storage/src/s3_bucket/metrics.rs +++ b/libs/remote_storage/src/s3_bucket/metrics.rs @@ -11,6 +11,7 @@ pub(crate) enum RequestKind { Put = 1, Delete = 2, List = 3, + Copy = 4, } use RequestKind::*; @@ -22,6 +23,7 @@ impl RequestKind { 
Put => "put_object", Delete => "delete_object", List => "list_objects", + Copy => "copy_object", } } const fn as_index(&self) -> usize { @@ -29,7 +31,7 @@ impl RequestKind { } } -pub(super) struct RequestTyped([C; 4]); +pub(super) struct RequestTyped([C; 5]); impl RequestTyped { pub(super) fn get(&self, kind: RequestKind) -> &C { @@ -38,8 +40,8 @@ impl RequestTyped { fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { use RequestKind::*; - let mut it = [Get, Put, Delete, List].into_iter(); - let arr = std::array::from_fn::(|index| { + let mut it = [Get, Put, Delete, List, Copy].into_iter(); + let arr = std::array::from_fn::(|index| { let next = it.next().unwrap(); assert_eq!(index, next.as_index()); f(next) diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 802b0db7f5..7f5adcea30 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -162,4 +162,11 @@ impl RemoteStorage for UnreliableWrapper { } Ok(()) } + + async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> { + // copy is equivalent to download + upload + self.attempt(RemoteOp::Download(from.clone()))?; + self.attempt(RemoteOp::Upload(to.clone()))?; + self.inner.copy_object(from, to).await + } } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 786712deb1..ce5a1e411e 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -51,3 +51,9 @@ pub struct SkTimelineInfo { #[serde(default)] pub http_connstr: Option, } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineCopyRequest { + pub target_timeline_id: TimelineId, + pub until_lsn: Lsn, +} diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index ac68b04888..3e9281ac81 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -31,6 +31,9 @@ pub enum ApiError { #[error("Shutting down")] ShuttingDown, + #[error("Timeout")] + Timeout(Cow<'static, str>), + #[error(transparent)] InternalServerError(anyhow::Error), } @@ -67,6 +70,10 @@ impl ApiError { err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::REQUEST_TIMEOUT, + ), ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::INTERNAL_SERVER_ERROR, diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 9e9b0adfe5..890061dc59 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -85,6 +85,8 @@ pub mod sync; pub mod failpoint_support; +pub mod yielding_loop; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/sync/gate.rs b/libs/utils/src/sync/gate.rs index 31c76d2f74..abc3842da8 100644 --- a/libs/utils/src/sync/gate.rs +++ b/libs/utils/src/sync/gate.rs @@ -15,6 +15,12 @@ pub struct Gate { name: String, } +impl std::fmt::Debug for Gate { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Gate<{}>", self.name) + } +} + /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will /// not complete. 
#[derive(Debug)] diff --git a/libs/utils/src/yielding_loop.rs b/libs/utils/src/yielding_loop.rs new file mode 100644 index 0000000000..963279eb4c --- /dev/null +++ b/libs/utils/src/yielding_loop.rs @@ -0,0 +1,35 @@ +use tokio_util::sync::CancellationToken; + +#[derive(thiserror::Error, Debug)] +pub enum YieldingLoopError { + #[error("Cancelled")] + Cancelled, +} + +/// Helper for long synchronous loops, e.g. over all tenants in the system. Periodically +/// yields to avoid blocking the executor, and after resuming checks the provided +/// cancellation token to drop out promptly on shutdown. +#[inline(always)] +pub async fn yielding_loop( + interval: usize, + cancel: &CancellationToken, + iter: I, + mut visitor: F, +) -> Result<(), YieldingLoopError> +where + I: Iterator, + F: FnMut(T), +{ + for (i, item) in iter.enumerate() { + visitor(item); + + if i + 1 % interval == 0 { + tokio::task::yield_now().await; + if cancel.is_cancelled() { + return Err(YieldingLoopError::Cancelled); + } + } + } + + Ok(()) +} diff --git a/libs/walproposer/src/walproposer.rs b/libs/walproposer/src/walproposer.rs index 35c8f6904d..7251545792 100644 --- a/libs/walproposer/src/walproposer.rs +++ b/libs/walproposer/src/walproposer.rs @@ -425,7 +425,7 @@ mod tests { } fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) { - println!("walprop_log[{}] {}", level, msg); + println!("wp_log[{}] {}", level, msg); } fn after_election(&self, _wp: &mut crate::bindings::WalProposer) { diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index ba41866935..4837626086 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -13,6 +13,7 @@ use bytes::{Buf, Bytes}; use pageserver::{ config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager, }; +use pageserver_api::shard::TenantShardId; use utils::{id::TenantId, lsn::Lsn}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; @@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) { let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_id); + let manager = PostgresRedoManager::new(conf, tenant_shard_id); let manager = Arc::new(manager); diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 87e4ed8efd..4c285293f7 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,4 +1,4 @@ -use pageserver_api::models::*; +use pageserver_api::{models::*, shard::TenantShardId}; use reqwest::{IntoUrl, Method}; use utils::{ http::error::HttpErrorBody, @@ -164,6 +164,18 @@ impl Client { Ok(()) } + pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{}/secondary/download", + self.mgmt_api_endpoint, tenant_id + ); + self.request(Method::POST, &uri, ()) + .await? 
+ .error_for_status() + .map(|_| ()) + .map_err(|e| Error::ApiError(format!("{}", e))) + } + pub async fn location_config( &self, tenant_id: TenantId, diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index fc0d2311f7..231461267a 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -115,15 +115,8 @@ impl PagestreamClient { pub async fn getpage( &mut self, - key: RelTagBlockNo, - lsn: Lsn, + req: PagestreamGetPageRequest, ) -> anyhow::Result { - let req = PagestreamGetPageRequest { - latest: false, - rel: key.rel_tag, - blkno: key.block_no, - lsn, - }; let req = PagestreamFeMessage::GetPage(req); let req: bytes::Bytes = req.serialize(); // let mut req = tokio_util::io::ReaderStream::new(&req); diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index 16d198ab0e..cb36a403f1 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -3,7 +3,7 @@ use futures::future::join_all; use pageserver::pgdatadir_mapping::key_to_rel_block; use pageserver::repository; use pageserver_api::key::is_rel_block_key; -use pageserver_client::page_service::RelTagBlockNo; +use pageserver_api::models::PagestreamGetPageRequest; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -39,6 +39,9 @@ pub(crate) struct Args { runtime: Option, #[clap(long)] per_target_rate_limit: Option, + /// Probability for sending `latest=true` in the request (uniform distribution). + #[clap(long, default_value = "1")] + req_latest_probability: f64, #[clap(long)] limit_to_first_n_targets: Option, targets: Option>, @@ -200,18 +203,26 @@ async fn main_impl( start_work_barrier.wait().await; loop { - let (range, key) = { + let (timeline, req) = { let mut rng = rand::thread_rng(); let r = &all_ranges[weights.sample(&mut rng)]; let key: i128 = rng.gen_range(r.start..r.end); let key = repository::Key::from_i128(key); let (rel_tag, block_no) = key_to_rel_block(key).expect("we filter non-rel-block keys out above"); - (r, RelTagBlockNo { rel_tag, block_no }) + ( + r.timeline, + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + }, + ) }; - let sender = work_senders.get(&range.timeline).unwrap(); + let sender = work_senders.get(&timeline).unwrap(); // TODO: what if this blocks? 
- sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + sender.send(req).await.ok().unwrap(); } }), Some(rps_limit) => Box::pin(async move { @@ -240,16 +251,21 @@ async fn main_impl( ); loop { ticker.tick().await; - let (range, key) = { + let req = { let mut rng = rand::thread_rng(); let r = &ranges[weights.sample(&mut rng)]; let key: i128 = rng.gen_range(r.start..r.end); let key = repository::Key::from_i128(key); let (rel_tag, block_no) = key_to_rel_block(key) .expect("we filter non-rel-block keys out above"); - (r, RelTagBlockNo { rel_tag, block_no }) + PagestreamGetPageRequest { + latest: rng.gen_bool(args.req_latest_probability), + lsn: r.timeline_lsn, + rel: rel_tag, + blkno: block_no, + } }; - sender.send((key, range.timeline_lsn)).await.ok().unwrap(); + sender.send(req).await.ok().unwrap(); } }) }; @@ -303,7 +319,7 @@ async fn client( args: &'static Args, timeline: TenantTimelineId, start_work_barrier: Arc, - mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>, + mut work: tokio::sync::mpsc::Receiver, all_work_done_barrier: Arc, live_stats: Arc, ) { @@ -317,10 +333,10 @@ async fn client( .await .unwrap(); - while let Some((key, lsn)) = work.recv().await { + while let Some(req) = work.recv().await { let start = Instant::now(); client - .getpage(key, lsn) + .getpage(req) .await .with_context(|| format!("getpage for {timeline}")) .unwrap(); diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index ed452eae7d..7e5ae892ad 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -23,6 +23,7 @@ use tracing::*; use tokio_tar::{Builder, EntryType, Header}; use crate::context::RequestContext; +use crate::pgdatadir_mapping::Version; use crate::tenant::Timeline; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -174,7 +175,7 @@ where ] { for segno in self .timeline - .list_slru_segments(kind, self.lsn, self.ctx) + .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx) .await? { self.add_slru_segment(kind, segno).await?; @@ -192,7 +193,7 @@ where // Otherwise only include init forks of unlogged relations. 
let rels = self .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await?; for &rel in rels.iter() { // Send init fork as main fork to provide well formed empty @@ -267,7 +268,7 @@ where async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> { let nblocks = self .timeline - .get_rel_size(src, self.lsn, false, self.ctx) + .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx) .await?; // If the relation is empty, create an empty file @@ -288,7 +289,7 @@ where for blknum in startblk..endblk { let img = self .timeline - .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx) + .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx) .await?; segment_data.extend_from_slice(&img[..]); } @@ -310,7 +311,7 @@ where async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> { let nblocks = self .timeline - .get_slru_segment_size(slru, segno, self.lsn, self.ctx) + .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx) .await?; let mut slru_buf: Vec = Vec::with_capacity(nblocks as usize * BLCKSZ as usize); @@ -352,7 +353,7 @@ where let relmap_img = if has_relmap_file { let img = self .timeline - .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx) + .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await?; ensure!( @@ -399,7 +400,7 @@ where if !has_relmap_file && self .timeline - .list_rels(spcnode, dbnode, self.lsn, self.ctx) + .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx) .await? .is_empty() { diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8516f397ca..7c03dc1bdd 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -37,8 +37,8 @@ use crate::tenant::{ TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME, }; use crate::{ - IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME, - TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, + IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME, + TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX, }; use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP; @@ -75,6 +75,9 @@ pub mod defaults { pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s"; pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8; + pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; /// /// Default built-in configuration file. 
@@ -88,6 +91,7 @@ pub mod defaults { #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}' #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}' +#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE} #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS} # initial superuser role name to use when creating a new tenant @@ -108,6 +112,8 @@ pub mod defaults { #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}' +#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE} + [tenant_config] #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT} @@ -125,6 +131,7 @@ pub mod defaults { #gc_feedback = false #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY} +#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY} [remote_storage] @@ -233,6 +240,13 @@ pub struct PageServerConf { /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize /// heatmap uploads vs. other remote storage operations. pub heatmap_upload_concurrency: usize, + + /// How many remote storage downloads may be done for secondary tenants concurrently. Implicitly + /// deprioritises secondary downloads vs. remote storage operations for attached tenants. + pub secondary_download_concurrency: usize, + + /// Maximum number of WAL records to be ingested and committed at the same time + pub ingest_batch_size: u64, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -314,6 +328,9 @@ struct PageServerConfigBuilder { control_plane_emergency_mode: BuilderValue, heatmap_upload_concurrency: BuilderValue, + secondary_download_concurrency: BuilderValue, + + ingest_batch_size: BuilderValue, } impl Default for PageServerConfigBuilder { @@ -386,6 +403,9 @@ impl Default for PageServerConfigBuilder { control_plane_emergency_mode: Set(false), heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY), + secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY), + + ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE), } } } @@ -534,6 +554,14 @@ impl PageServerConfigBuilder { self.heatmap_upload_concurrency = BuilderValue::Set(value) } + pub fn secondary_download_concurrency(&mut self, value: usize) { + self.secondary_download_concurrency = BuilderValue::Set(value) + } + + pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) { + self.ingest_batch_size = BuilderValue::Set(ingest_batch_size) + } + pub fn build(self) -> anyhow::Result { let concurrent_tenant_warmup = self .concurrent_tenant_warmup @@ -632,10 +660,15 @@ impl PageServerConfigBuilder { control_plane_emergency_mode: self .control_plane_emergency_mode .ok_or(anyhow!("missing control_plane_emergency_mode"))?, - heatmap_upload_concurrency: self .heatmap_upload_concurrency .ok_or(anyhow!("missing heatmap_upload_concurrency"))?, + secondary_download_concurrency: self + .secondary_download_concurrency + .ok_or(anyhow!("missing secondary_download_concurrency"))?, + ingest_batch_size: self + .ingest_batch_size + .ok_or(anyhow!("missing ingest_batch_size"))?, }) } } @@ -693,6 +726,11 @@ impl PageServerConf { .join(TENANT_LOCATION_CONFIG_NAME) } + pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { + self.tenant_path(tenant_shard_id) + .join(TENANT_HEATMAP_BASENAME) + } + pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf { self.tenant_path(tenant_shard_id) .join(TIMELINES_SEGMENT_NAME) @@ -878,6 +916,10 @@ impl PageServerConf { 
"heatmap_upload_concurrency" => { builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize) }, + "secondary_download_concurrency" => { + builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize) + }, + "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?), _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -949,6 +991,8 @@ impl PageServerConf { control_plane_api_token: None, control_plane_emergency_mode: false, heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, } } } @@ -1177,7 +1221,9 @@ background_task_maximum_delay = '334 s' control_plane_api: None, control_plane_api_token: None, control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE, }, "Correct defaults should be used when no config values are provided" ); @@ -1238,7 +1284,9 @@ background_task_maximum_delay = '334 s' control_plane_api: None, control_plane_api_token: None, control_plane_emergency_mode: false, - heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY + heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY, + secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY, + ingest_batch_size: 100, }, "Should be able to parse all basic config values correctly" ); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 157e6b4e3e..5c7747d353 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -152,6 +152,7 @@ impl From for ApiError { PageReconstructError::AncestorStopping(_) => { ApiError::ResourceUnavailable(format!("{pre}").into()) } + PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()), PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre), } } @@ -1273,6 +1274,23 @@ async fn put_tenant_location_config_handler( // which is not a 400 but a 409. 
.map_err(ApiError::BadRequest)?; + if let Some(_flush_ms) = flush { + match state + .secondary_controller + .upload_tenant(tenant_shard_id) + .await + { + Ok(()) => { + tracing::info!("Uploaded heatmap during flush"); + } + Err(e) => { + tracing::warn!("Failed to flush heatmap: {e}"); + } + } + } else { + tracing::info!("No flush requested when configuring"); + } + json_response(StatusCode::OK, ()) } @@ -1610,6 +1628,21 @@ async fn secondary_upload_handler( json_response(StatusCode::OK, ()) } +async fn secondary_download_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let state = get_state(&request); + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + state + .secondary_controller + .download_tenant(tenant_shard_id) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1878,6 +1911,9 @@ pub fn make_router( .put("/v1/deletion_queue/flush", |r| { api_handler(r, deletion_queue_flush) }) + .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| { + api_handler(r, secondary_download_handler) + }) .put("/v1/tenant/:tenant_shard_id/break", |r| { testing_api_handler("set tenant state to broken", r, handle_tenant_break) }) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index d95d75449d..d66df36b3a 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -21,6 +21,7 @@ use tracing::*; use walkdir::WalkDir; use crate::context::RequestContext; +use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::Timeline; @@ -312,13 +313,16 @@ async fn import_wal( waldecoder.feed_bytes(&buf); let mut nrecords = 0; - let mut modification = tline.begin_modification(endpoint); + let mut modification = tline.begin_modification(last_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= endpoint { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; + WAL_INGEST.records_committed.inc(); + + modification.commit(ctx).await?; last_lsn = lsn; nrecords += 1; @@ -448,13 +452,14 @@ pub async fn import_wal_from_tar( waldecoder.feed_bytes(&bytes[offset..]); - let mut modification = tline.begin_modification(end_lsn); + let mut modification = tline.begin_modification(last_lsn); let mut decoded = DecodedWALRecord::default(); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx) .await?; + modification.commit(ctx).await?; last_lsn = lsn; debug!("imported record at {} (end {})", lsn, end_lsn); diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index c1ce0af47b..26070e0cc1 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -117,6 +117,10 @@ pub const TENANT_CONFIG_NAME: &str = "config"; /// Full path: `tenants//config`. pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1"; +/// Per-tenant copy of their remote heatmap, downloaded into the local +/// tenant path while in secondary mode. +pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json"; + /// A suffix used for various temporary files. Any temporary files found in the /// data directory at pageserver startup can be automatically removed. 
pub const TEMP_FILE_SUFFIX: &str = "___temp"; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 4725903783..6f4431c3cf 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[ // Metrics collected on operations on the storage repository. #[derive(Debug, EnumVariantNames, IntoStaticStr)] #[strum(serialize_all = "kebab_case")] -pub enum StorageTimeOperation { +pub(crate) enum StorageTimeOperation { #[strum(serialize = "layer flush")] LayerFlush, @@ -55,7 +55,7 @@ pub enum StorageTimeOperation { CreateTenant, } -pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { register_counter_vec!( "pageserver_storage_operations_seconds_sum", "Total time spent on storage operations with operation, tenant and timeline dimensions", @@ -64,7 +64,7 @@ pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_storage_operations_seconds_count", "Count of storage operations with operation, tenant and timeline dimensions", @@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub struct PageCacheMetricsForTaskKind { +pub(crate) struct PageCacheMetricsForTaskKind { pub read_accesses_materialized_page: IntCounter, pub read_accesses_immutable: IntCounter, @@ -159,7 +159,7 @@ pub struct PageCacheMetricsForTaskKind { pub read_hits_materialized_page_older_lsn: IntCounter, } -pub struct PageCacheMetrics { +pub(crate) struct PageCacheMetrics { map: EnumMap>, } @@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { +pub(crate) static PAGE_CACHE: Lazy = Lazy::new(|| PageCacheMetrics { map: EnumMap::from_array(std::array::from_fn(|task_kind| { let task_kind = ::from_usize(task_kind); let task_kind: &'static str = task_kind.into(); @@ -243,10 +243,9 @@ impl PageCacheMetrics { } } -pub struct PageCacheSizeMetrics { +pub(crate) struct PageCacheSizeMetrics { pub max_bytes: UIntGauge, - pub current_bytes_ephemeral: UIntGauge, pub current_bytes_immutable: UIntGauge, pub current_bytes_materialized_page: UIntGauge, } @@ -260,31 +259,26 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static PAGE_CACHE_SIZE: Lazy = Lazy::new(|| PageCacheSizeMetrics { - max_bytes: { - register_uint_gauge!( - "pageserver_page_cache_size_max_bytes", - "Maximum size of the page cache in bytes" - ) - .expect("failed to define a metric") - }, - - current_bytes_ephemeral: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["ephemeral"]) - .unwrap() - }, - current_bytes_immutable: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["immutable"]) - .unwrap() - }, - current_bytes_materialized_page: { - PAGE_CACHE_SIZE_CURRENT_BYTES - .get_metric_with_label_values(&["materialized_page"]) - .unwrap() - }, -}); +pub(crate) static PAGE_CACHE_SIZE: Lazy = + Lazy::new(|| PageCacheSizeMetrics { + max_bytes: { + register_uint_gauge!( + "pageserver_page_cache_size_max_bytes", + "Maximum size of the page cache in bytes" + ) + .expect("failed to define a metric") + }, + 
current_bytes_immutable: { + PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["immutable"]) + .unwrap() + }, + current_bytes_materialized_page: { + PAGE_CACHE_SIZE_CURRENT_BYTES + .get_metric_with_label_values(&["materialized_page"]) + .unwrap() + }, + }); pub(crate) mod page_cache_eviction_metrics { use std::num::NonZeroUsize; @@ -740,13 +734,13 @@ pub(crate) static TENANT: Lazy = Lazy::new(|| { /// Each `Timeline`'s [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric. #[derive(Debug)] -pub struct EvictionsWithLowResidenceDuration { +pub(crate) struct EvictionsWithLowResidenceDuration { data_source: &'static str, threshold: Duration, counter: Option, } -pub struct EvictionsWithLowResidenceDurationBuilder { +pub(crate) struct EvictionsWithLowResidenceDurationBuilder { data_source: &'static str, threshold: Duration, } @@ -1009,7 +1003,7 @@ pub enum SmgrQueryType { } #[derive(Debug)] -pub struct SmgrQueryTimePerTimeline { +pub(crate) struct SmgrQueryTimePerTimeline { metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], } @@ -1181,8 +1175,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| { .map(|ms| (ms as f64) / 1000.0) }); -pub struct BasebackupQueryTime(HistogramVec); -pub static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { +pub(crate) struct BasebackupQueryTime(HistogramVec); +pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(|| { BasebackupQueryTime({ register_histogram_vec!( "pageserver_basebackup_query_seconds", @@ -1202,7 +1196,7 @@ impl DurationResultObserver for BasebackupQueryTime { } } -pub static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { +pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_live_connections", "Number of live network connections", @@ -1369,6 +1363,8 @@ pub(crate) struct SecondaryModeMetrics { pub(crate) upload_heatmap: IntCounter, pub(crate) upload_heatmap_errors: IntCounter, pub(crate) upload_heatmap_duration: Histogram, + pub(crate) download_heatmap: IntCounter, + pub(crate) download_layer: IntCounter, } pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| SecondaryModeMetrics { upload_heatmap: register_int_counter!( @@ -1386,6 +1382,16 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| Seco "Time to build and upload a heatmap, including any waiting inside the S3 client" ) .expect("failed to define a metric"), + download_heatmap: register_int_counter!( + "pageserver_secondary_download_heatmap", + "Number of downloads of heatmaps by secondary mode locations" + ) + .expect("failed to define a metric"), + download_layer: register_int_counter!( + "pageserver_secondary_download_layer", + "Number of downloads of layers by secondary mode locations" + ) + .expect("failed to define a metric"), }); #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] @@ -1655,7 +1661,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy = Lazy::new(WalRedoProcessCounters::default); /// Similar to `prometheus::HistogramTimer` but does not record on drop. -pub struct StorageTimeMetricsTimer { +pub(crate) struct StorageTimeMetricsTimer { metrics: StorageTimeMetrics, start: Instant, } @@ -1680,7 +1686,7 @@ impl StorageTimeMetricsTimer { /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and /// timeline total sum and count. 
#[derive(Clone, Debug)] -pub struct StorageTimeMetrics { +pub(crate) struct StorageTimeMetrics { /// Sum of f64 seconds, per operation, tenant_id and timeline_id timeline_sum: Counter, /// Number of oeprations, per operation, tenant_id and timeline_id @@ -1719,7 +1725,7 @@ impl StorageTimeMetrics { } #[derive(Debug)] -pub struct TimelineMetrics { +pub(crate) struct TimelineMetrics { tenant_id: String, shard_id: String, timeline_id: String, @@ -1927,7 +1933,7 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge { } } -pub struct RemoteTimelineClientMetrics { +pub(crate) struct RemoteTimelineClientMetrics { tenant_id: String, timeline_id: String, remote_physical_size_gauge: Mutex>, @@ -2225,7 +2231,7 @@ impl Drop for RemoteTimelineClientMetrics { /// Wrapper future that measures the time spent by a remote storage operation, /// and records the time and success/failure as a prometheus metric. -pub trait MeasureRemoteOp: Sized { +pub(crate) trait MeasureRemoteOp: Sized { fn measure_remote_op( self, tenant_id: TenantId, @@ -2250,7 +2256,7 @@ pub trait MeasureRemoteOp: Sized { impl MeasureRemoteOp for T {} pin_project! { - pub struct MeasuredRemoteOp + pub(crate) struct MeasuredRemoteOp { #[pin] inner: F, diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index d5ca7f7382..291490d016 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -25,6 +25,7 @@ use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, Qu use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; +use std::borrow::Cow; use std::io; use std::net::TcpListener; use std::pin::pin; @@ -53,7 +54,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::import_datadir::import_wal_from_tar; use crate::metrics; use crate::metrics::LIVE_CONNECTIONS_COUNT; -use crate::pgdatadir_mapping::rel_block_to_key; +use crate::pgdatadir_mapping::{rel_block_to_key, Version}; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id; @@ -61,6 +62,9 @@ use crate::tenant::mgr; use crate::tenant::mgr::get_active_tenant_with_timeout; use crate::tenant::mgr::GetActiveTenantError; use crate::tenant::mgr::ShardSelector; +use crate::tenant::timeline::WaitLsnError; +use crate::tenant::GetTimelineError; +use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::trace::Tracer; @@ -283,6 +287,64 @@ struct PageServerHandler { connection_ctx: RequestContext, } +#[derive(thiserror::Error, Debug)] +enum PageStreamError { + /// We encountered an error that should prompt the client to reconnect: + /// in practice this means we drop the connection without sending a response. + #[error("Reconnect required: {0}")] + Reconnect(Cow<'static, str>), + + /// We were instructed to shutdown while processing the query + #[error("Shutting down")] + Shutdown, + + /// Something went wrong reading a page: this likely indicates a pageserver bug + #[error("Read error: {0}")] + Read(PageReconstructError), + + /// Ran out of time waiting for an LSN + #[error("LSN timeout: {0}")] + LsnTimeout(WaitLsnError), + + /// The entity required to serve the request (tenant or timeline) is not found, + /// or is not found in a suitable state to serve a request. 
+ #[error("Not found: {0}")] + NotFound(std::borrow::Cow<'static, str>), + + /// Request asked for something that doesn't make sense, like an invalid LSN + #[error("Bad request: {0}")] + BadRequest(std::borrow::Cow<'static, str>), +} + +impl From for PageStreamError { + fn from(value: PageReconstructError) -> Self { + match value { + PageReconstructError::Cancelled => Self::Shutdown, + e => Self::Read(e), + } + } +} + +impl From for PageStreamError { + fn from(value: GetActiveTimelineError) -> Self { + match value { + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown, + GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()), + GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()), + } + } +} + +impl From for PageStreamError { + fn from(value: WaitLsnError) -> Self { + match value { + e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e), + WaitLsnError::Shutdown => Self::Shutdown, + WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()), + } + } +} + impl PageServerHandler { pub fn new( conf: &'static PageServerConf, @@ -428,7 +490,7 @@ impl PageServerHandler { // Check that the timeline exists let timeline = tenant .get_timeline(timeline_id, true) - .map_err(|e| anyhow::anyhow!(e))?; + .map_err(|e| QueryError::NotFound(format!("{e}").into()))?; // Avoid starting new requests if the timeline has already started shutting down, // and block timeline shutdown until this request is complete, or drops out due @@ -520,32 +582,44 @@ impl PageServerHandler { } }; - if let Err(e) = &response { - // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, - // because wait_lsn etc will drop out - // is_stopping(): [`Timeline::flush_and_shutdown`] has entered - // is_canceled(): [`Timeline::shutdown`]` has entered - if timeline.cancel.is_cancelled() || timeline.is_stopping() { + match response { + Err(PageStreamError::Shutdown) => { // If we fail to fulfil a request during shutdown, which may be _because_ of // shutdown, then do not send the error to the client. Instead just drop the // connection. - span.in_scope(|| info!("dropped response during shutdown: {e:#}")); + span.in_scope(|| info!("dropping connection due to shutdown")); return Err(QueryError::Shutdown); } + Err(PageStreamError::Reconnect(reason)) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + Err(e) if timeline.cancel.is_cancelled() || timeline.is_stopping() => { + // This branch accomodates code within request handlers that returns an anyhow::Error instead of a clean + // shutdown error, this may be buried inside a PageReconstructError::Other for example. + // + // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet, + // because wait_lsn etc will drop out + // is_stopping(): [`Timeline::flush_and_shutdown`] has entered + // is_canceled(): [`Timeline::shutdown`]` has entered + span.in_scope(|| info!("dropped error response during shutdown: {e:#}")); + return Err(QueryError::Shutdown); + } + r => { + let response_msg = r.unwrap_or_else(|e| { + // print the all details to the log with {:#}, but for the client the + // error message is enough. Do not log if shutting down, as the anyhow::Error + // here includes cancellation which is not an error. 
+ span.in_scope(|| error!("error reading relation or page version: {:#}", e)); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + }); + + pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + self.flush_cancellable(pgb, &timeline.cancel).await?; + } } - - let response = response.unwrap_or_else(|e| { - // print the all details to the log with {:#}, but for the client the - // error message is enough. Do not log if shutting down, as the anyhow::Error - // here includes cancellation which is not an error. - span.in_scope(|| error!("error reading relation or page version: {:#}", e)); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - }); - - pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?; - self.flush_cancellable(pgb, &timeline.cancel).await?; } Ok(()) } @@ -692,7 +766,7 @@ impl PageServerHandler { latest: bool, latest_gc_cutoff_lsn: &RcuReadGuard, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { if latest { // Latest page version was requested. If LSN is given, it is a hint // to the page server that there have been no modifications to the @@ -723,15 +797,19 @@ impl PageServerHandler { } } else { if lsn == Lsn(0) { - anyhow::bail!("invalid LSN(0) in request"); + return Err(PageStreamError::BadRequest( + "invalid LSN(0) in request".into(), + )); } timeline.wait_lsn(lsn, ctx).await?; } - anyhow::ensure!( - lsn >= **latest_gc_cutoff_lsn, - "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", - lsn, **latest_gc_cutoff_lsn - ); + + if lsn < **latest_gc_cutoff_lsn { + return Err(PageStreamError::BadRequest(format!( + "tried to request a page version that was garbage collected. requested at {} gc cutoff {}", + lsn, **latest_gc_cutoff_lsn + ).into())); + } Ok(lsn) } @@ -740,14 +818,14 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamExistsRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; let exists = timeline - .get_rel_exists(req.rel, lsn, req.latest, ctx) + .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx) .await?; Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse { @@ -760,13 +838,15 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamNblocksRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; - let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?; + let n_blocks = timeline + .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx) + .await?; Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse { n_blocks, @@ -778,14 +858,20 @@ impl PageServerHandler { timeline: &Timeline, req: &PagestreamDbSizeRequest, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) .await?; let total_blocks = timeline - .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx) + .get_db_size( + DEFAULTTABLESPACE_OID, + req.dbnode, + Version::Lsn(lsn), + req.latest, + ctx, + ) .await?; let db_size = total_blocks as i64 * BLCKSZ as 
i64; @@ -794,30 +880,35 @@ impl PageServerHandler { })) } + async fn do_handle_get_page_at_lsn_request( + &self, + timeline: &Timeline, + req: &PagestreamGetPageRequest, + ctx: &RequestContext, + ) -> Result { + let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); + let lsn = + Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) + .await?; + let page = timeline + .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx) + .await?; + + Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { + page, + })) + } + async fn handle_get_page_at_lsn_request( &self, timeline: &Timeline, req: &PagestreamGetPageRequest, ctx: &RequestContext, - ) -> anyhow::Result { - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); - let lsn = - Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx) - .await?; - /* - // Add a 1s delay to some requests. The delay helps the requests to - // hit the race condition from github issue #1047 more easily. - use rand::Rng; - if rand::thread_rng().gen::() < 5 { - std::thread::sleep(std::time::Duration::from_millis(1000)); - } - */ - + ) -> Result { let key = rel_block_to_key(req.rel, req.blkno); - let page = if timeline.get_shard_identity().is_key_local(&key) { - timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) - .await? + if timeline.get_shard_identity().is_key_local(&key) { + self.do_handle_get_page_at_lsn_request(timeline, req, ctx) + .await } else { // The Tenant shard we looked up at connection start does not hold this particular // key: look for other shards in this tenant. This scenario occurs if a pageserver @@ -836,30 +927,30 @@ impl PageServerHandler { Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { // We already know this tenant exists in general, because we resolved it at // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node. - - // TODO: this should be some kind of structured error that the client will understand, - // so that it can block until its config is updated: this error is expected in the case - // that the Tenant's shards' placements are being updated and the client hasn't been - // informed yet. - // - // https://github.com/neondatabase/neon/issues/6038 - return Err(anyhow::anyhow!("Request routed to wrong shard")); + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + tracing::info!("Page request routed to wrong shard: my identity {:?}, should go to shard {}, key {}", + timeline.get_shard_identity(), timeline.get_shard_identity().get_shard_number(&key).0, key); + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. + return Err(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into(), + )); } Err(e) => return Err(e.into()), }; // Take a GateGuard for the duration of this request. If we were using our main Timeline object, // the GateGuard was already held over the whole connection. - let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?; - timeline - .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx) - .await? 
- }; + let _timeline_guard = timeline + .gate + .enter() + .map_err(|_| PageStreamError::Shutdown)?; - Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse { - page, - })) + self.do_handle_get_page_at_lsn_request(&timeline, req, ctx) + .await + } } #[allow(clippy::too_many_arguments)] @@ -1000,9 +1091,7 @@ impl PageServerHandler { ) .await .map_err(GetActiveTimelineError::Tenant)?; - let timeline = tenant - .get_timeline(timeline_id, true) - .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?; + let timeline = tenant.get_timeline(timeline_id, true)?; Ok(timeline) } } @@ -1424,14 +1513,15 @@ enum GetActiveTimelineError { #[error(transparent)] Tenant(GetActiveTenantError), #[error(transparent)] - Timeline(anyhow::Error), + Timeline(#[from] GetTimelineError), } impl From for QueryError { fn from(e: GetActiveTimelineError) -> Self { match e { + GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown, GetActiveTimelineError::Tenant(e) => e.into(), - GetActiveTimelineError::Timeline(e) => QueryError::Other(e), + GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()), } } } diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index e9884a15f5..f11a72f2ab 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -11,7 +11,7 @@ use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::repository::*; use crate::walrecord::NeonWalRecord; -use anyhow::Context; +use anyhow::{ensure, Context}; use bytes::{Buf, Bytes}; use pageserver_api::key::is_rel_block_key; use pageserver_api::reltag::{RelTag, SlruKind}; @@ -147,6 +147,7 @@ impl Timeline { { DatadirModification { tline: self, + pending_lsns: Vec::new(), pending_updates: HashMap::new(), pending_deletions: Vec::new(), pending_nblocks: 0, @@ -159,11 +160,11 @@ impl Timeline { //------------------------------------------------------------------------------ /// Look up given page version. 
- pub async fn get_rel_page_at_lsn( + pub(crate) async fn get_rel_page_at_lsn( &self, tag: RelTag, blknum: BlockNumber, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { @@ -173,44 +174,47 @@ impl Timeline { )); } - let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?; + let nblocks = self.get_rel_size(tag, version, latest, ctx).await?; if blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", - tag, blknum, lsn, nblocks + tag, + blknum, + version.get_lsn(), + nblocks ); return Ok(ZERO_PAGE.clone()); } let key = rel_block_to_key(tag, blknum); - self.get(key, lsn, ctx).await + version.get(self, key, ctx).await } // Get size of a database in blocks - pub async fn get_db_size( + pub(crate) async fn get_db_size( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { let mut total_blocks = 0; - let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?; + let rels = self.list_rels(spcnode, dbnode, version, ctx).await?; for rel in rels { - let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?; + let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?; total_blocks += n_blocks as usize; } Ok(total_blocks) } /// Get size of a relation file - pub async fn get_rel_size( + pub(crate) async fn get_rel_size( &self, tag: RelTag, - lsn: Lsn, + version: Version<'_>, latest: bool, ctx: &RequestContext, ) -> Result { @@ -220,12 +224,12 @@ impl Timeline { )); } - if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) { + if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(nblocks); } if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM) - && !self.get_rel_exists(tag, lsn, latest, ctx).await? + && !self.get_rel_exists(tag, version, latest, ctx).await? { // FIXME: Postgres sometimes calls smgrcreate() to create // FSM, and smgrnblocks() on it immediately afterwards, @@ -235,7 +239,7 @@ impl Timeline { } let key = rel_size_to_key(tag); - let mut buf = self.get(key, lsn, ctx).await?; + let mut buf = version.get(self, key, ctx).await?; let nblocks = buf.get_u32_le(); if latest { @@ -246,16 +250,16 @@ impl Timeline { // latest=true, then it can not cause cache corruption, because with latest=true // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be // associated with most recent value of LSN. - self.update_cached_rel_size(tag, lsn, nblocks); + self.update_cached_rel_size(tag, version.get_lsn(), nblocks); } Ok(nblocks) } /// Does relation exist? - pub async fn get_rel_exists( + pub(crate) async fn get_rel_exists( &self, tag: RelTag, - lsn: Lsn, + version: Version<'_>, _latest: bool, ctx: &RequestContext, ) -> Result { @@ -266,12 +270,12 @@ impl Timeline { } // first try to lookup relation in cache - if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) { + if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) { return Ok(true); } // fetch directory listing let key = rel_dir_to_key(tag.spcnode, tag.dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -287,16 +291,16 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
- pub async fn list_rels( + pub(crate) async fn list_rels( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory listing let key = rel_dir_to_key(spcnode, dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match RelDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -315,7 +319,7 @@ impl Timeline { } /// Look up given SLRU page version. - pub async fn get_slru_page_at_lsn( + pub(crate) async fn get_slru_page_at_lsn( &self, kind: SlruKind, segno: u32, @@ -328,29 +332,29 @@ impl Timeline { } /// Get size of an SLRU segment - pub async fn get_slru_segment_size( + pub(crate) async fn get_slru_segment_size( &self, kind: SlruKind, segno: u32, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { let key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(key, lsn, ctx).await?; + let mut buf = version.get(self, key, ctx).await?; Ok(buf.get_u32_le()) } /// Get size of an SLRU segment - pub async fn get_slru_segment_exists( + pub(crate) async fn get_slru_segment_exists( &self, kind: SlruKind, segno: u32, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { // fetch directory listing let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => { @@ -368,7 +372,7 @@ impl Timeline { /// so it's not well defined which LSN you get if there were multiple commits /// "in flight" at that point in time. /// - pub async fn find_lsn_for_timestamp( + pub(crate) async fn find_lsn_for_timestamp( &self, search_timestamp: TimestampTz, cancel: &CancellationToken, @@ -448,7 +452,7 @@ impl Timeline { /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits /// with a smaller/larger timestamp. /// - pub async fn is_latest_commit_timestamp_ge_than( + pub(crate) async fn is_latest_commit_timestamp_ge_than( &self, search_timestamp: TimestampTz, probe_lsn: Lsn, @@ -471,7 +475,7 @@ impl Timeline { /// Obtain the possible timestamp range for the given lsn. /// /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps. - pub async fn get_timestamp_for_lsn( + pub(crate) async fn get_timestamp_for_lsn( &self, probe_lsn: Lsn, ctx: &RequestContext, @@ -501,11 +505,11 @@ impl Timeline { mut f: impl FnMut(TimestampTz) -> ControlFlow, ) -> Result { for segno in self - .list_slru_segments(SlruKind::Clog, probe_lsn, ctx) + .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx) .await? 
{ let nblocks = self - .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx) + .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx) .await?; for blknum in (0..nblocks).rev() { let clog_page = self @@ -528,36 +532,36 @@ impl Timeline { } /// Get a list of SLRU segments - pub async fn list_slru_segments( + pub(crate) async fn list_slru_segments( &self, kind: SlruKind, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result, PageReconstructError> { // fetch directory entry let key = slru_dir_to_key(kind); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; match SlruSegmentDirectory::des(&buf).context("deserialization failure") { Ok(dir) => Ok(dir.segments), Err(e) => Err(PageReconstructError::from(e)), } } - pub async fn get_relmap_file( + pub(crate) async fn get_relmap_file( &self, spcnode: Oid, dbnode: Oid, - lsn: Lsn, + version: Version<'_>, ctx: &RequestContext, ) -> Result { let key = relmap_file_key(spcnode, dbnode); - let buf = self.get(key, lsn, ctx).await?; + let buf = version.get(self, key, ctx).await?; Ok(buf) } - pub async fn list_dbdirs( + pub(crate) async fn list_dbdirs( &self, lsn: Lsn, ctx: &RequestContext, @@ -571,7 +575,7 @@ impl Timeline { } } - pub async fn get_twophase_file( + pub(crate) async fn get_twophase_file( &self, xid: TransactionId, lsn: Lsn, @@ -582,7 +586,7 @@ impl Timeline { Ok(buf) } - pub async fn list_twophase_files( + pub(crate) async fn list_twophase_files( &self, lsn: Lsn, ctx: &RequestContext, @@ -596,7 +600,7 @@ impl Timeline { } } - pub async fn get_control_file( + pub(crate) async fn get_control_file( &self, lsn: Lsn, ctx: &RequestContext, @@ -604,7 +608,7 @@ impl Timeline { self.get(CONTROLFILE_KEY, lsn, ctx).await } - pub async fn get_checkpoint( + pub(crate) async fn get_checkpoint( &self, lsn: Lsn, ctx: &RequestContext, @@ -612,7 +616,7 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - pub async fn list_aux_files( + pub(crate) async fn list_aux_files( &self, lsn: Lsn, ctx: &RequestContext, @@ -652,7 +656,10 @@ impl Timeline { let mut total_size: u64 = 0; for (spcnode, dbnode) in dbdir.dbdirs.keys() { - for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? { + for rel in self + .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx) + .await? + { if self.cancel.is_cancelled() { return Err(CalculateLogicalSizeError::Cancelled); } @@ -692,7 +699,7 @@ impl Timeline { result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self - .list_rels(spcnode, dbnode, lsn, ctx) + .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx) .await? .into_iter() .collect(); @@ -799,18 +806,39 @@ pub struct DatadirModification<'a> { /// in the state in 'tline' yet. pub tline: &'a Timeline, - /// Lsn assigned by begin_modification - pub lsn: Lsn, + /// Current LSN of the modification + lsn: Lsn, // The modifications are not applied directly to the underlying key-value store. // The put-functions add the modifications here, and they are flushed to the // underlying key-value store by the 'finish' function. 
- pending_updates: HashMap, - pending_deletions: Vec>, + pending_lsns: Vec, + pending_updates: HashMap>, + pending_deletions: Vec<(Range, Lsn)>, pending_nblocks: i64, } impl<'a> DatadirModification<'a> { + /// Get the current lsn + pub(crate) fn get_lsn(&self) -> Lsn { + self.lsn + } + + /// Set the current lsn + pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> { + ensure!( + lsn >= self.lsn, + "setting an older lsn {} than {} is not allowed", + lsn, + self.lsn + ); + if lsn > self.lsn { + self.pending_lsns.push(self.lsn); + self.lsn = lsn; + } + Ok(()) + } + /// Initialize a completely new repository. /// /// This inserts the directory metadata entries that are assumed to @@ -984,11 +1012,9 @@ impl<'a> DatadirModification<'a> { dbnode: Oid, ctx: &RequestContext, ) -> anyhow::Result<()> { - let req_lsn = self.tline.get_last_record_lsn(); - let total_blocks = self .tline - .get_db_size(spcnode, dbnode, req_lsn, true, ctx) + .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx) .await?; // Remove entry from dbdir @@ -1077,8 +1103,11 @@ impl<'a> DatadirModification<'a> { ctx: &RequestContext, ) -> anyhow::Result<()> { anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); - let last_lsn = self.tline.get_last_record_lsn(); - if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? { + if self + .tline + .get_rel_exists(rel, Version::Modified(self), true, ctx) + .await? + { let size_key = rel_size_to_key(rel); // Fetch the old size first let old_size = self.get(size_key, ctx).await?.get_u32_le(); @@ -1323,17 +1352,23 @@ impl<'a> DatadirModification<'a> { let writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let mut retained_pending_updates = HashMap::new(); - for (key, value) in self.pending_updates.drain() { - if is_rel_block_key(&key) || is_slru_block_key(key) { - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. - writer.put(key, self.lsn, &value, ctx).await?; - } else { - retained_pending_updates.insert(key, value); + let mut retained_pending_updates = HashMap::<_, Vec<_>>::new(); + for (key, values) in self.pending_updates.drain() { + for (lsn, value) in values { + if is_rel_block_key(&key) || is_slru_block_key(key) { + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. + writer.put(key, lsn, &value, ctx).await?; + } else { + retained_pending_updates + .entry(key) + .or_default() + .push((lsn, value)); + } } } - self.pending_updates.extend(retained_pending_updates); + + self.pending_updates = retained_pending_updates; if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1350,18 +1385,28 @@ impl<'a> DatadirModification<'a> { /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { let writer = self.tline.writer().await; - let lsn = self.lsn; + let pending_nblocks = self.pending_nblocks; self.pending_nblocks = 0; - for (key, value) in self.pending_updates.drain() { - writer.put(key, lsn, &value, ctx).await?; - } - for key_range in self.pending_deletions.drain(..) 
{ - writer.delete(key_range, lsn).await?; + if !self.pending_updates.is_empty() { + writer.put_batch(&self.pending_updates, ctx).await?; + self.pending_updates.clear(); } - writer.finish_write(lsn); + if !self.pending_deletions.is_empty() { + writer.delete_batch(&self.pending_deletions).await?; + self.pending_deletions.clear(); + } + + self.pending_lsns.push(self.lsn); + for pending_lsn in self.pending_lsns.drain(..) { + // Ideally, we should be able to call writer.finish_write() only once + // with the highest LSN. However, the last_record_lsn variable in the + // timeline keeps track of the latest LSN and the immediate previous LSN + // so we need to record every LSN to not leave a gap between them. + writer.finish_write(pending_lsn); + } if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1370,44 +1415,86 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub(crate) fn is_empty(&self) -> bool { - self.pending_updates.is_empty() && self.pending_deletions.is_empty() + pub(crate) fn len(&self) -> usize { + self.pending_updates.len() + self.pending_deletions.len() } // Internal helper functions to batch the modifications async fn get(&self, key: Key, ctx: &RequestContext) -> Result { - // Have we already updated the same key? Read the pending updated + // Have we already updated the same key? Read the latest pending updated // version in that case. // // Note: we don't check pending_deletions. It is an error to request a // value that has been removed, deletion only avoids leaking storage. - if let Some(value) = self.pending_updates.get(&key) { - if let Value::Image(img) = value { - Ok(img.clone()) - } else { - // Currently, we never need to read back a WAL record that we - // inserted in the same "transaction". All the metadata updates - // work directly with Images, and we never need to read actual - // data pages. We could handle this if we had to, by calling - // the walredo manager, but let's keep it simple for now. - Err(PageReconstructError::from(anyhow::anyhow!( - "unexpected pending WAL record" - ))) + if let Some(values) = self.pending_updates.get(&key) { + if let Some((_, value)) = values.last() { + return if let Value::Image(img) = value { + Ok(img.clone()) + } else { + // Currently, we never need to read back a WAL record that we + // inserted in the same "transaction". All the metadata updates + // work directly with Images, and we never need to read actual + // data pages. We could handle this if we had to, by calling + // the walredo manager, but let's keep it simple for now. 
+ Err(PageReconstructError::from(anyhow::anyhow!( + "unexpected pending WAL record" + ))) + }; } - } else { - let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - self.tline.get(key, lsn, ctx).await } + let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); + self.tline.get(key, lsn, ctx).await } fn put(&mut self, key: Key, val: Value) { - self.pending_updates.insert(key, val); + let values = self.pending_updates.entry(key).or_default(); + // Replace the previous value if it exists at the same lsn + if let Some((last_lsn, last_value)) = values.last_mut() { + if *last_lsn == self.lsn { + *last_value = val; + return; + } + } + values.push((self.lsn, val)); } fn delete(&mut self, key_range: Range) { trace!("DELETE {}-{}", key_range.start, key_range.end); - self.pending_deletions.push(key_range); + self.pending_deletions.push((key_range, self.lsn)); + } +} + +/// This struct facilitates accessing either a committed key from the timeline at a +/// specific LSN, or the latest uncommitted key from a pending modification. +/// During WAL ingestion, the records from multiple LSNs may be batched in the same +/// modification before being flushed to the timeline. Hence, the routines in WalIngest +/// need to look up the keys in the modification first before looking them up in the +/// timeline to not miss the latest updates. +#[derive(Clone, Copy)] +pub enum Version<'a> { + Lsn(Lsn), + Modified(&'a DatadirModification<'a>), +} + +impl<'a> Version<'a> { + async fn get( + &self, + timeline: &Timeline, + key: Key, + ctx: &RequestContext, + ) -> Result { + match self { + Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await, + Version::Modified(modification) => modification.get(key, ctx).await, + } + } + + fn get_lsn(&self) -> Lsn { + match self { + Version::Lsn(lsn) => *lsn, + Version::Modified(modification) => modification.lsn, + } } } diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index cb1b2b8011..5a06a97525 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy = Lazy::new(|| // else, but that has not been needed in a long time. std::env::var("TOKIO_WORKER_THREADS") .map(|s| s.parse::().unwrap()) - .unwrap_or_else(|_e| usize::max(1, num_cpus::get())) + .unwrap_or_else(|_e| usize::max(2, num_cpus::get())) }); #[derive(Debug, Clone, Copy)] @@ -258,6 +258,9 @@ pub enum TaskKind { /// See [`crate::disk_usage_eviction_task`]. DiskUsageEviction, + /// See [`crate::tenant::secondary`]. + SecondaryDownloads, + /// See [`crate::tenant::secondary`]. 
SecondaryUploads, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e50987c84b..7c609452e5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -56,6 +56,7 @@ use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::TimelineResources; +use self::timeline::WaitLsnError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; @@ -595,10 +596,9 @@ impl Tenant { mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result> { - // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, ))); let TenantSharedResources { @@ -1145,10 +1145,9 @@ impl Tenant { tenant_shard_id: TenantShardId, reason: String, ) -> Arc { - // TODO(sharding): make WalRedoManager shard-aware let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( conf, - tenant_shard_id.tenant_id, + tenant_shard_id, ))); Arc::new(Tenant::new( TenantState::Broken { @@ -1760,7 +1759,15 @@ impl Tenant { // decoding the new WAL might need to look up previous pages, relation // sizes etc. and that would get confused if the previous page versions // are not in the repository yet. - ancestor_timeline.wait_lsn(*lsn, ctx).await?; + ancestor_timeline + .wait_lsn(*lsn, ctx) + .await + .map_err(|e| match e { + e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => { + CreateTimelineError::AncestorLsn(anyhow::anyhow!(e)) + } + WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown, + })?; } self.branch_timeline( diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 25d97f51ce..2d4cd350d7 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -46,6 +46,8 @@ pub mod defaults { pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds"; pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024; pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour"; + + pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index b21bad51ba..2f606ed822 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -588,7 +588,7 @@ impl DeleteTenantFlow { } break; } - TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => { + TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => { // This is unexpected: this secondary tenants should not have been created, and we // are not in a position to shut it down from here. 
tracing::warn!("Tenant transitioned to secondary mode while deleting!"); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 62922e8c99..70b41b7b1f 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -44,6 +44,7 @@ use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; use super::delete::DeleteTenantError; +use super::secondary::SecondaryTenant; use super::TenantSharedResources; /// For a tenant that appears in TenantsMap, it may either be @@ -57,7 +58,7 @@ use super::TenantSharedResources; /// having a properly acquired generation (Secondary doesn't need a generation) pub(crate) enum TenantSlot { Attached(Arc), - Secondary, + Secondary(Arc), /// In this state, other administrative operations acting on the TenantId should /// block, or return a retry indicator equivalent to HTTP 503. InProgress(utils::completion::Barrier), @@ -67,7 +68,7 @@ impl std::fmt::Debug for TenantSlot { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()), - Self::Secondary => write!(f, "Secondary"), + Self::Secondary(_) => write!(f, "Secondary"), Self::InProgress(_) => write!(f, "InProgress"), } } @@ -78,7 +79,7 @@ impl TenantSlot { fn get_attached(&self) -> Option<&Arc> { match self { Self::Attached(t) => Some(t), - Self::Secondary => None, + Self::Secondary(_) => None, Self::InProgress(_) => None, } } @@ -130,7 +131,7 @@ impl TenantsMap { /// A page service client sends a TenantId, and to look up the correct Tenant we must /// resolve this to a fully qualified TenantShardId. - fn resolve_shard( + fn resolve_attached_shard( &self, tenant_id: &TenantId, selector: ShardSelector, @@ -140,25 +141,27 @@ impl TenantsMap { TenantsMap::Initializing => None, TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => { for slot in m.range(TenantShardId::tenant_range(*tenant_id)) { + // Ignore all slots that don't contain an attached tenant + let tenant = match &slot.1 { + TenantSlot::Attached(t) => t, + _ => continue, + }; + match selector { ShardSelector::First => return Some(*slot.0), ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => { return Some(*slot.0) } ShardSelector::Page(key) => { - if let Some(tenant) = slot.1.get_attached() { - // First slot we see for this tenant, calculate the expected shard number - // for the key: we will use this for checking if this and subsequent - // slots contain the key, rather than recalculating the hash each time. - if want_shard.is_none() { - want_shard = Some(tenant.shard_identity.get_shard_number(&key)); - } + // First slot we see for this tenant, calculate the expected shard number + // for the key: we will use this for checking if this and subsequent + // slots contain the key, rather than recalculating the hash each time. 
+ if want_shard.is_none() { + want_shard = Some(tenant.shard_identity.get_shard_number(&key)); + } - if Some(tenant.shard_identity.number) == want_shard { - return Some(*slot.0); - } - } else { - continue; + if Some(tenant.shard_identity.number) == want_shard { + return Some(*slot.0); } } _ => continue, @@ -464,12 +467,18 @@ pub async fn init_tenant_mgr( *gen } else { match &location_conf.mode { - LocationMode::Secondary(_) => { + LocationMode::Secondary(secondary_config) => { // We do not require the control plane's permission for secondary mode // tenants, because they do no remote writes and hence require no // generation number info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode"); - tenants.insert(tenant_shard_id, TenantSlot::Secondary); + tenants.insert( + tenant_shard_id, + TenantSlot::Secondary(SecondaryTenant::new( + tenant_shard_id, + secondary_config, + )), + ); } LocationMode::Attached(_) => { // TODO: augment re-attach API to enable the control plane to @@ -661,8 +670,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { total_attached += 1; } - TenantSlot::Secondary => { - shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary); + TenantSlot::Secondary(state) => { + // We don't need to wait for this individually per-tenant: the + // downloader task will be waited on eventually, this cancel + // is just to encourage it to drop out if it is doing work + // for this tenant right now. + state.cancel.cancel(); + + shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state)); } TenantSlot::InProgress(notify) => { // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will @@ -845,12 +860,28 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } - None | Some(TenantSlot::Secondary) => { + None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) } } } + pub(crate) fn get_secondary_tenant_shard( + &self, + tenant_shard_id: TenantShardId, + ) -> Option> { + let locked = self.tenants.read().unwrap(); + + let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) + .ok() + .flatten(); + + match peek_slot { + Some(TenantSlot::Secondary(s)) => Some(s.clone()), + _ => None, + } + } + #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] pub(crate) async fn upsert_location( &self, @@ -862,10 +893,15 @@ impl TenantManager { debug_assert_current_span_has_tenant_id(); info!("configuring tenant location to state {new_location_config:?}"); - // Special case fast-path for updates to Tenant: if our upsert is only updating configuration, + enum FastPathModified { + Attached(Arc), + Secondary(Arc), + } + + // Special case fast-path for updates to existing slots: if our upsert is only updating configuration, // then we do not need to set the slot to InProgress, we can just call into the // existng tenant. 
- let modify_tenant = { + let fast_path_taken = { let locked = self.tenants.read().unwrap(); let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?; @@ -879,12 +915,19 @@ impl TenantManager { new_location_config.clone(), )?); - Some(tenant.clone()) + Some(FastPathModified::Attached(tenant.clone())) } else { // Different generations, fall through to general case None } } + ( + LocationMode::Secondary(secondary_conf), + Some(TenantSlot::Secondary(secondary_tenant)), + ) => { + secondary_tenant.set_config(secondary_conf); + Some(FastPathModified::Secondary(secondary_tenant.clone())) + } _ => { // Not an Attached->Attached transition, fall through to general case None @@ -893,34 +936,51 @@ impl TenantManager { }; // Fast-path continued: having dropped out of the self.tenants lock, do the async - // phase of waiting for flush, before returning. - if let Some(tenant) = modify_tenant { - // Transition to AttachedStale means we may well hold a valid generation - // still, and have been requested to go stale as part of a migration. If - // the caller set `flush`, then flush to remote storage. - if let LocationMode::Attached(AttachedLocationConfig { - generation: _, - attach_mode: AttachmentMode::Stale, - }) = &new_location_config.mode - { - if let Some(flush_timeout) = flush { - match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { - Ok(Err(e)) => { - return Err(e); - } - Ok(Ok(_)) => return Ok(()), - Err(_) => { - tracing::warn!( + // phase of writing config and/or waiting for flush, before returning. + match fast_path_taken { + Some(FastPathModified::Attached(tenant)) => { + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + + // Transition to AttachedStale means we may well hold a valid generation + // still, and have been requested to go stale as part of a migration. If + // the caller set `flush`, then flush to remote storage. + if let LocationMode::Attached(AttachedLocationConfig { + generation: _, + attach_mode: AttachmentMode::Stale, + }) = &new_location_config.mode + { + if let Some(flush_timeout) = flush { + match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await { + Ok(Err(e)) => { + return Err(e); + } + Ok(Ok(_)) => return Ok(()), + Err(_) => { + tracing::warn!( timeout_ms = flush_timeout.as_millis(), "Timed out waiting for flush to remote storage, proceeding anyway." ) + } } } } - } - return Ok(()); - } + return Ok(()); + } + Some(FastPathModified::Secondary(_secondary_tenant)) => { + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .map_err(SetNewTenantConfigError::Persist)?; + + return Ok(()); + } + None => { + // Proceed with the general case procedure, where we will shutdown & remove any existing + // slot contents and replace with a fresh one + } + }; // General case for upserts to TenantsMap, excluding the case above: we will substitute an // InProgress value to the slot while we make whatever changes are required. The state for @@ -929,33 +989,47 @@ impl TenantManager { // not do significant I/O, and shutdowns should be prompt via cancellation tokens. let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?; - if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() { - // The case where we keep a Tenant alive was covered above in the special case - // for Attached->Attached transitions in the same generation. 
By this point, - // if we see an attached tenant we know it will be discarded and should be - // shut down. - let (_guard, progress) = utils::completion::channel(); + match slot_guard.get_old_value() { + Some(TenantSlot::Attached(tenant)) => { + // The case where we keep a Tenant alive was covered above in the special case + // for Attached->Attached transitions in the same generation. By this point, + // if we see an attached tenant we know it will be discarded and should be + // shut down. + let (_guard, progress) = utils::completion::channel(); - match tenant.get_attach_mode() { - AttachmentMode::Single | AttachmentMode::Multi => { - // Before we leave our state as the presumed holder of the latest generation, - // flush any outstanding deletions to reduce the risk of leaking objects. - self.resources.deletion_queue_client.flush_advisory() - } - AttachmentMode::Stale => { - // If we're stale there's not point trying to flush deletions - } - }; + match tenant.get_attach_mode() { + AttachmentMode::Single | AttachmentMode::Multi => { + // Before we leave our state as the presumed holder of the latest generation, + // flush any outstanding deletions to reduce the risk of leaking objects. + self.resources.deletion_queue_client.flush_advisory() + } + AttachmentMode::Stale => { + // If we're stale there's not point trying to flush deletions + } + }; - info!("Shutting down attached tenant"); - match tenant.shutdown(progress, false).await { - Ok(()) => {} - Err(barrier) => { - info!("Shutdown already in progress, waiting for it to complete"); - barrier.wait().await; + info!("Shutting down attached tenant"); + match tenant.shutdown(progress, false).await { + Ok(()) => {} + Err(barrier) => { + info!("Shutdown already in progress, waiting for it to complete"); + barrier.wait().await; + } } + slot_guard.drop_old_value().expect("We just shut it down"); + } + Some(TenantSlot::Secondary(state)) => { + info!("Shutting down secondary tenant"); + state.shutdown().await; + } + Some(TenantSlot::InProgress(_)) => { + // This should never happen: acquire_slot should error out + // if the contents of a slot were InProgress. + anyhow::bail!("Acquired an InProgress slot, this is a bug.") + } + None => { + // Slot was vacant, nothing needs shutting down. } - slot_guard.drop_old_value().expect("We just shut it down"); } let tenant_path = self.conf.tenant_path(&tenant_shard_id); @@ -978,7 +1052,9 @@ impl TenantManager { .map_err(SetNewTenantConfigError::Persist)?; let new_slot = match &new_location_config.mode { - LocationMode::Secondary(_) => TenantSlot::Secondary, + LocationMode::Secondary(secondary_config) => { + TenantSlot::Secondary(SecondaryTenant::new(tenant_shard_id, secondary_config)) + } LocationMode::Attached(_attach_config) => { let shard_identity = new_location_config.shard; let tenant = tenant_spawn( @@ -1091,6 +1167,30 @@ impl TenantManager { .collect(), } } + // Do some synchronous work for all tenant slots in Secondary state. The provided + // callback should be small and fast, as it will be called inside the global + // TenantsMap lock. 
+ pub(crate) fn foreach_secondary_tenants(&self, mut func: F) + where + // TODO: let the callback return a hint to drop out of the loop early + F: FnMut(&TenantShardId, &Arc), + { + let locked = self.tenants.read().unwrap(); + + let map = match &*locked { + TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return, + TenantsMap::Open(m) => m, + }; + + for (tenant_id, slot) in map { + if let TenantSlot::Secondary(state) = slot { + // Only expose secondary tenants that are not currently shutting down + if !state.cancel.is_cancelled() { + func(tenant_id, state) + } + } + } + } pub(crate) async fn delete_tenant( &self, @@ -1205,7 +1305,7 @@ pub(crate) fn get_tenant( Some(TenantSlot::InProgress(_)) => { Err(GetTenantError::NotActive(tenant_shard_id.tenant_id)) } - None | Some(TenantSlot::Secondary) => { + None | Some(TenantSlot::Secondary(_)) => { Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) } } @@ -1257,9 +1357,11 @@ pub(crate) async fn get_active_tenant_with_timeout( let locked = TENANTS.read().unwrap(); // Resolve TenantId to TenantShardId - let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or( - GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)), - )?; + let tenant_shard_id = locked + .resolve_attached_shard(&tenant_id, shard_selector) + .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound( + tenant_id, + )))?; let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read) .map_err(GetTenantError::MapState)?; @@ -1276,7 +1378,7 @@ pub(crate) async fn get_active_tenant_with_timeout( } } } - Some(TenantSlot::Secondary) => { + Some(TenantSlot::Secondary(_)) => { return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive( tenant_id, ))) @@ -1540,7 +1642,7 @@ pub(crate) async fn list_tenants() -> Result, Ok(m.iter() .filter_map(|(id, tenant)| match tenant { TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())), - TenantSlot::Secondary => None, + TenantSlot::Secondary(_) => None, TenantSlot::InProgress(_) => None, }) .collect()) @@ -1797,11 +1899,7 @@ impl SlotGuard { fn old_value_is_shutdown(&self) -> bool { match self.old_value.as_ref() { Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(), - Some(TenantSlot::Secondary) => { - // TODO: when adding secondary mode tenants, this will check for shutdown - // in the same way that we do for `Tenant` above - true - } + Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(), Some(TenantSlot::InProgress(_)) => { // A SlotGuard cannot be constructed for a slot that was already InProgress unreachable!() @@ -2011,26 +2109,19 @@ where let mut slot_guard = tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?; - // The SlotGuard allows us to manipulate the Tenant object without fear of some - // concurrent API request doing something else for the same tenant ID. - let attached_tenant = match slot_guard.get_old_value() { - Some(TenantSlot::Attached(t)) => Some(t), - _ => None, - }; - // allow pageserver shutdown to await for our completion let (_guard, progress) = completion::channel(); - // If the tenant was attached, shut it down gracefully. For secondary - // locations this part is not necessary - match &attached_tenant { - Some(attached_tenant) => { + // The SlotGuard allows us to manipulate the Tenant object without fear of some + // concurrent API request doing something else for the same tenant ID. 
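// A minimal illustrative sketch (not part of this patch) of the slot-guard idea relied
// on here, with simplified stand-in types (`Slot`, `SlotGuard`, `acquire_slot` are
// assumptions for the sketch): while one request mutates a slot, the map holds an
// `InProgress` marker so that concurrent requests back off, and the guard installs the
// final value once the work is done.
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

enum Slot {
    Attached(String),
    Secondary(String),
    InProgress,
}

struct SlotGuard {
    map: Arc<Mutex<HashMap<u64, Slot>>>,
    key: u64,
    old_value: Option<Slot>,
}

fn acquire_slot(map: &Arc<Mutex<HashMap<u64, Slot>>>, key: u64) -> Option<SlotGuard> {
    let mut m = map.lock().unwrap();
    if matches!(m.get(&key), Some(Slot::InProgress)) {
        // Someone else is already operating on this slot: caller should retry later.
        return None;
    }
    // Park an InProgress marker and remember the previous contents, which the caller
    // may still need to shut down before installing a replacement.
    let old_value = m.insert(key, Slot::InProgress);
    Some(SlotGuard { map: map.clone(), key, old_value })
}

impl SlotGuard {
    fn old_value(&self) -> Option<&Slot> {
        self.old_value.as_ref()
    }

    /// Install the final slot contents, consuming the guard.
    fn upsert(self, new_value: Slot) {
        self.map.lock().unwrap().insert(self.key, new_value);
    }
}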
+ let attached_tenant = match slot_guard.get_old_value() { + Some(TenantSlot::Attached(tenant)) => { // whenever we remove a tenant from memory, we don't want to flush and wait for upload let freeze_and_flush = false; // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so // that we can continue safely to cleanup. - match attached_tenant.shutdown(progress, freeze_and_flush).await { + match tenant.shutdown(progress, freeze_and_flush).await { Ok(()) => {} Err(_other) => { // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to @@ -2039,11 +2130,19 @@ where return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id)); } } + Some(tenant) } - None => { - // Nothing to wait on when not attached, proceed. + Some(TenantSlot::Secondary(secondary_state)) => { + tracing::info!("Shutting down in secondary mode"); + secondary_state.shutdown().await; + None } - } + Some(TenantSlot::InProgress(_)) => { + // Acquiring a slot guarantees its old value was not InProgress + unreachable!(); + } + None => None, + }; match tenant_cleanup .await diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1b0cf39fbe..2ea3ced008 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -229,6 +229,7 @@ use crate::{ tenant::upload_queue::{ UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask, }, + TENANT_HEATMAP_BASENAME, }; use utils::id::{TenantId, TimelineId}; @@ -818,8 +819,25 @@ impl RemoteTimelineClient { fn schedule_deletion_of_unlinked0( self: &Arc, upload_queue: &mut UploadQueueInitialized, - with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, + mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>, ) { + // Filter out any layers which were not created by this tenant shard. These are + // layers that originate from some ancestor shard after a split, and may still + // be referenced by other shards. We are free to delete them locally and remove + // them from our index (and would have already done so when we reach this point + // in the code), but we may not delete them remotely. 
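// A minimal illustrative sketch (not part of this patch) of the ownership test applied
// by the retain() below, as a standalone predicate with a stand-in `ShardPosition`
// type: a layer may only be deleted remotely by the shard whose number and count both
// match the layer's metadata, i.e. the shard that originally wrote it, not a child
// shard that inherited it across a split.
#[derive(Clone, Copy, PartialEq, Eq)]
struct ShardPosition {
    shard_number: u8,
    shard_count: u8,
}

fn owned_by_this_shard(layer_shard: ShardPosition, this_shard: ShardPosition) -> bool {
    layer_shard.shard_number == this_shard.shard_number
        && layer_shard.shard_count == this_shard.shard_count
}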
+ with_metadata.retain(|(name, meta)| { + let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number + && meta.shard.shard_count == self.tenant_shard_id.shard_count; + if !retain { + tracing::debug!( + "Skipping deletion of ancestor-shard layer {name}, from shard {}", + meta.shard + ); + } + retain + }); + for (name, meta) in &with_metadata { info!( "scheduling deletion of layer {}{} (shard {})", @@ -1724,11 +1742,11 @@ pub fn remote_index_path( .expect("Failed to construct path") } -pub const HEATMAP_BASENAME: &str = "heatmap-v1.json"; - pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath { - RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}")) - .expect("Failed to construct path") + RemotePath::from_string(&format!( + "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}" + )) + .expect("Failed to construct path") } /// Given the key of an index, parse out the generation part of the name diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index d25fe56b92..2331447266 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -1,24 +1,48 @@ +mod downloader; pub mod heatmap; mod heatmap_uploader; +mod scheduler; use std::sync::Arc; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; -use self::heatmap_uploader::heatmap_uploader_task; +use self::{ + downloader::{downloader_task, SecondaryDetail}, + heatmap_uploader::heatmap_uploader_task, +}; -use super::mgr::TenantManager; +use super::{config::SecondaryLocationConfig, mgr::TenantManager}; use pageserver_api::shard::TenantShardId; use remote_storage::GenericRemoteStorage; use tokio_util::sync::CancellationToken; -use utils::completion::Barrier; +use utils::{completion::Barrier, sync::gate::Gate}; +enum DownloadCommand { + Download(TenantShardId), +} enum UploadCommand { Upload(TenantShardId), } +impl UploadCommand { + fn get_tenant_shard_id(&self) -> &TenantShardId { + match self { + Self::Upload(id) => id, + } + } +} + +impl DownloadCommand { + fn get_tenant_shard_id(&self) -> &TenantShardId { + match self { + Self::Download(id) => id, + } + } +} + struct CommandRequest { payload: T, response_tx: tokio::sync::oneshot::Sender, @@ -28,12 +52,73 @@ struct CommandResponse { result: anyhow::Result<()>, } +// Whereas [`Tenant`] represents an attached tenant, this type represents the work +// we do for secondary tenant locations: where we are not serving clients or +// ingesting WAL, but we are maintaining a warm cache of layer files. +// +// This type is all about the _download_ path for secondary mode. The upload path +// runs separately (see [`heatmap_uploader`]) while a regular attached `Tenant` exists. +// +// This structure coordinates TenantManager and SecondaryDownloader, +// so that the downloader can indicate which tenants it is currently +// operating on, and the manager can indicate when a particular +// secondary tenant should cancel any work in flight. +#[derive(Debug)] +pub(crate) struct SecondaryTenant { + /// Carrying a tenant shard ID simplifies callers such as the downloader + /// which need to organize many of these objects by ID. + tenant_shard_id: TenantShardId, + + /// Cancellation token indicates to SecondaryDownloader that it should stop doing + /// any work for this tenant at the next opportunity. 
+ pub(crate) cancel: CancellationToken, + + pub(crate) gate: Gate, + + detail: std::sync::Mutex, +} + +impl SecondaryTenant { + pub(crate) fn new( + tenant_shard_id: TenantShardId, + config: &SecondaryLocationConfig, + ) -> Arc { + Arc::new(Self { + tenant_shard_id, + // todo: shall we make this a descendent of the + // main cancellation token, or is it sufficient that + // on shutdown we walk the tenants and fire their + // individual cancellations? + cancel: CancellationToken::new(), + gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")), + + detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), + }) + } + + pub(crate) async fn shutdown(&self) { + self.cancel.cancel(); + + // Wait for any secondary downloader work to complete + self.gate.close().await; + } + + pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) { + self.detail.lock().unwrap().config = config.clone(); + } + + fn get_tenant_shard_id(&self) -> &TenantShardId { + &self.tenant_shard_id + } +} + /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads, /// and heatmap uploads. This is not a hot data path: it's primarily a hook for tests, /// where we want to immediately upload/download for a particular tenant. In normal operation /// uploads & downloads are autonomous and not driven by this interface. pub struct SecondaryController { upload_req_tx: tokio::sync::mpsc::Sender>, + download_req_tx: tokio::sync::mpsc::Sender>, } impl SecondaryController { @@ -63,6 +148,13 @@ impl SecondaryController { self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) .await } + pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + self.dispatch( + &self.download_req_tx, + DownloadCommand::Download(tenant_shard_id), + ) + .await + } } pub fn spawn_tasks( @@ -71,9 +163,37 @@ pub fn spawn_tasks( background_jobs_can_start: Barrier, cancel: CancellationToken, ) -> SecondaryController { + let mgr_clone = tenant_manager.clone(); + let storage_clone = remote_storage.clone(); + let cancel_clone = cancel.clone(); + let bg_jobs_clone = background_jobs_can_start.clone(); + + let (download_req_tx, download_req_rx) = + tokio::sync::mpsc::channel::>(16); let (upload_req_tx, upload_req_rx) = tokio::sync::mpsc::channel::>(16); + task_mgr::spawn( + BACKGROUND_RUNTIME.handle(), + TaskKind::SecondaryDownloads, + None, + None, + "secondary tenant downloads", + false, + async move { + downloader_task( + mgr_clone, + storage_clone, + download_req_rx, + bg_jobs_clone, + cancel_clone, + ) + .await; + + Ok(()) + }, + ); + task_mgr::spawn( BACKGROUND_RUNTIME.handle(), TaskKind::SecondaryUploads, @@ -89,16 +209,26 @@ pub fn spawn_tasks( background_jobs_can_start, cancel, ) - .await + .await; + + Ok(()) }, ); - SecondaryController { upload_req_tx } + SecondaryController { + download_req_tx, + upload_req_tx, + } } /// For running with remote storage disabled: a SecondaryController that is connected to nothing. 
pub fn null_controller() -> SecondaryController { + let (download_req_tx, _download_req_rx) = + tokio::sync::mpsc::channel::>(16); let (upload_req_tx, _upload_req_rx) = tokio::sync::mpsc::channel::>(16); - SecondaryController { upload_req_tx } + SecondaryController { + upload_req_tx, + download_req_tx, + } } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs new file mode 100644 index 0000000000..6fdee08a4e --- /dev/null +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -0,0 +1,801 @@ +use std::{ + collections::{HashMap, HashSet}, + pin::Pin, + str::FromStr, + sync::Arc, + time::{Duration, Instant, SystemTime}, +}; + +use crate::{ + config::PageServerConf, + metrics::SECONDARY_MODE, + tenant::{ + config::SecondaryLocationConfig, + debug_assert_current_span_has_tenant_and_timeline_id, + remote_timeline_client::{ + index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, + }, + span::debug_assert_current_span_has_tenant_id, + storage_layer::LayerFileName, + tasks::{warn_when_period_overrun, BackgroundLoopKind}, + }, + virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}, + METADATA_FILE_NAME, TEMP_FILE_SUFFIX, +}; + +use super::{ + heatmap::HeatMapLayer, + scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs}, + SecondaryTenant, +}; + +use crate::tenant::{ + mgr::TenantManager, + remote_timeline_client::{download::download_layer_file, remote_heatmap_path}, +}; + +use chrono::format::{DelayedFormat, StrftimeItems}; +use futures::Future; +use pageserver_api::shard::TenantShardId; +use rand::Rng; +use remote_storage::{DownloadError, GenericRemoteStorage}; + +use tokio_util::sync::CancellationToken; +use tracing::{info_span, instrument, Instrument}; +use utils::{ + backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId, +}; + +use super::{ + heatmap::{HeatMapTenant, HeatMapTimeline}, + CommandRequest, DownloadCommand, +}; + +/// For each tenant, how long must have passed since the last download_tenant call before +/// calling it again. This is approximately the time by which local data is allowed +/// to fall behind remote data. 
+/// +/// TODO: this should just be a default, and the actual period should be controlled +/// via the heatmap itself +/// `` +const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000); + +pub(super) async fn downloader_task( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) { + let concurrency = tenant_manager.get_conf().secondary_download_concurrency; + + let generator = SecondaryDownloader { + tenant_manager, + remote_storage, + }; + let mut scheduler = Scheduler::new(generator, concurrency); + + scheduler + .run(command_queue, background_jobs_can_start, cancel) + .instrument(info_span!("secondary_downloads")) + .await +} + +struct SecondaryDownloader { + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, +} + +#[derive(Debug, Clone)] +pub(super) struct OnDiskState { + metadata: LayerFileMetadata, + access_time: SystemTime, +} + +impl OnDiskState { + fn new( + _conf: &'static PageServerConf, + _tenant_shard_id: &TenantShardId, + _imeline_id: &TimelineId, + _ame: LayerFileName, + metadata: LayerFileMetadata, + access_time: SystemTime, + ) -> Self { + Self { + metadata, + access_time, + } + } +} + +#[derive(Debug, Clone, Default)] +pub(super) struct SecondaryDetailTimeline { + pub(super) on_disk_layers: HashMap, + + /// We remember when layers were evicted, to prevent re-downloading them. + pub(super) evicted_at: HashMap, +} + +/// This state is written by the secondary downloader, it is opaque +/// to TenantManager +#[derive(Debug)] +pub(super) struct SecondaryDetail { + pub(super) config: SecondaryLocationConfig, + + last_download: Option, + next_download: Option, + pub(super) timelines: HashMap, +} + +/// Helper for logging SystemTime +fn strftime(t: &'_ SystemTime) -> DelayedFormat> { + let datetime: chrono::DateTime = (*t).into(); + datetime.format("%d/%m/%Y %T") +} + +impl SecondaryDetail { + pub(super) fn new(config: SecondaryLocationConfig) -> Self { + Self { + config, + last_download: None, + next_download: None, + timelines: HashMap::new(), + } + } +} + +struct PendingDownload { + secondary_state: Arc, + last_download: Option, + target_time: Option, + period: Option, +} + +impl scheduler::PendingJob for PendingDownload { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.secondary_state.get_tenant_shard_id() + } +} + +struct RunningDownload { + barrier: Barrier, +} + +impl scheduler::RunningJob for RunningDownload { + fn get_barrier(&self) -> Barrier { + self.barrier.clone() + } +} + +struct CompleteDownload { + secondary_state: Arc, + completed_at: Instant, +} + +impl scheduler::Completion for CompleteDownload { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.secondary_state.get_tenant_shard_id() + } +} + +type Scheduler = TenantBackgroundJobs< + SecondaryDownloader, + PendingDownload, + RunningDownload, + CompleteDownload, + DownloadCommand, +>; + +#[async_trait::async_trait] +impl JobGenerator + for SecondaryDownloader +{ + #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))] + fn on_completion(&mut self, completion: CompleteDownload) { + let CompleteDownload { + secondary_state, + completed_at: _completed_at, + } = completion; + + tracing::debug!("Secondary tenant download completed"); + + // Update freshened_at even if there was an error: we don't want errored tenants to implicitly + // take priority to run again. 
+ let mut detail = secondary_state.detail.lock().unwrap(); + detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL); + } + + async fn schedule(&mut self) -> SchedulingResult { + let mut result = SchedulingResult { + jobs: Vec::new(), + want_interval: None, + }; + + // Step 1: identify some tenants that we may work on + let mut tenants: Vec> = Vec::new(); + self.tenant_manager + .foreach_secondary_tenants(|_id, secondary_state| { + tenants.push(secondary_state.clone()); + }); + + // Step 2: filter out tenants which are not yet elegible to run + let now = Instant::now(); + result.jobs = tenants + .into_iter() + .filter_map(|secondary_tenant| { + let (last_download, next_download) = { + let mut detail = secondary_tenant.detail.lock().unwrap(); + + if !detail.config.warm { + // Downloads are disabled for this tenant + detail.next_download = None; + return None; + } + + if detail.next_download.is_none() { + // Initialize with a jitter: this spreads initial downloads on startup + // or mass-attach across our freshen interval. + let jittered_period = + rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL); + detail.next_download = Some(now.checked_add(jittered_period).expect( + "Using our constant, which is known to be small compared with clock range", + )); + } + (detail.last_download, detail.next_download.unwrap()) + }; + + if now < next_download { + Some(PendingDownload { + secondary_state: secondary_tenant, + last_download, + target_time: Some(next_download), + period: Some(DOWNLOAD_FRESHEN_INTERVAL), + }) + } else { + None + } + }) + .collect(); + + // Step 3: sort by target execution time to run most urgent first. + result.jobs.sort_by_key(|j| j.target_time); + + result + } + + fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result { + let tenant_shard_id = command.get_tenant_shard_id(); + + let tenant = self + .tenant_manager + .get_secondary_tenant_shard(*tenant_shard_id); + let Some(tenant) = tenant else { + { + return Err(anyhow::anyhow!("Not found or not in Secondary mode")); + } + }; + + Ok(PendingDownload { + target_time: None, + period: None, + last_download: None, + secondary_state: tenant, + }) + } + + fn spawn( + &mut self, + job: PendingDownload, + ) -> ( + RunningDownload, + Pin + Send>>, + ) { + let PendingDownload { + secondary_state, + last_download, + target_time, + period, + } = job; + + let (completion, barrier) = utils::completion::channel(); + let remote_storage = self.remote_storage.clone(); + let conf = self.tenant_manager.get_conf(); + let tenant_shard_id = *secondary_state.get_tenant_shard_id(); + (RunningDownload { barrier }, Box::pin(async move { + let _completion = completion; + + match TenantDownloader::new(conf, &remote_storage, &secondary_state) + .download() + .await + { + Err(UpdateError::NoData) => { + tracing::info!("No heatmap found for tenant. This is fine if it is new."); + }, + Err(UpdateError::NoSpace) => { + tracing::warn!("Insufficient space while downloading. Will retry later."); + } + Err(UpdateError::Cancelled) => { + tracing::debug!("Shut down while downloading"); + }, + Err(UpdateError::Deserialize(e)) => { + tracing::error!("Corrupt content while downloading tenant: {e}"); + }, + Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => { + tracing::error!("Error while downloading tenant: {e}"); + }, + Ok(()) => {} + }; + + // Irrespective of the result, we will reschedule ourselves to run after our usual period. 
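// A minimal illustrative sketch (not part of this patch) of the jittered scheduling
// used in `schedule()` above, with stand-in names: spreading each tenant's first run
// uniformly over the freshen interval avoids a burst of downloads right after startup
// or a mass attach, and a job only becomes due once `now` reaches its scheduled time.
use rand::Rng;
use std::time::{Duration, Instant};

const FRESHEN_INTERVAL: Duration = Duration::from_secs(60);

fn initial_next_run(now: Instant) -> Instant {
    // Uniform jitter in [0, FRESHEN_INTERVAL).
    let jitter = rand::thread_rng().gen_range(Duration::ZERO..FRESHEN_INTERVAL);
    now + jitter
}

fn is_due(now: Instant, next_run: Instant) -> bool {
    now >= next_run
}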
+ + // If the job had a target execution time, we may check our final execution + // time against that for observability purposes. + if let (Some(target_time), Some(period)) = (target_time, period) { + // Only track execution lag if this isn't our first download: otherwise, it is expected + // that execution will have taken longer than our configured interval, for example + // when starting up a pageserver and + if last_download.is_some() { + // Elapsed time includes any scheduling lag as well as the execution of the job + let elapsed = Instant::now().duration_since(target_time); + + warn_when_period_overrun( + elapsed, + period, + BackgroundLoopKind::SecondaryDownload, + ); + } + } + + CompleteDownload { + secondary_state, + completed_at: Instant::now(), + } + }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) + } +} + +/// This type is a convenience to group together the various functions involved in +/// freshening a secondary tenant. +struct TenantDownloader<'a> { + conf: &'static PageServerConf, + remote_storage: &'a GenericRemoteStorage, + secondary_state: &'a SecondaryTenant, +} + +/// Errors that may be encountered while updating a tenant +#[derive(thiserror::Error, Debug)] +enum UpdateError { + #[error("No remote data found")] + NoData, + #[error("Insufficient local storage space")] + NoSpace, + #[error("Failed to download")] + DownloadError(DownloadError), + #[error(transparent)] + Deserialize(#[from] serde_json::Error), + #[error("Cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +impl From for UpdateError { + fn from(value: DownloadError) -> Self { + match &value { + DownloadError::Cancelled => Self::Cancelled, + DownloadError::NotFound => Self::NoData, + _ => Self::DownloadError(value), + } + } +} + +impl From for UpdateError { + fn from(value: std::io::Error) -> Self { + if let Some(nix::errno::Errno::ENOSPC) = value.raw_os_error().map(nix::errno::from_i32) { + UpdateError::NoSpace + } else { + // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue + UpdateError::Other(anyhow::anyhow!(value)) + } + } +} + +impl<'a> TenantDownloader<'a> { + fn new( + conf: &'static PageServerConf, + remote_storage: &'a GenericRemoteStorage, + secondary_state: &'a SecondaryTenant, + ) -> Self { + Self { + conf, + remote_storage, + secondary_state, + } + } + + async fn download(&self) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_id(); + + // For the duration of a download, we must hold the SecondaryTenant::gate, to ensure + // cover our access to local storage. + let Ok(_guard) = self.secondary_state.gate.enter() else { + // Shutting down + return Ok(()); + }; + + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + // Download the tenant's heatmap + let heatmap_bytes = tokio::select!( + bytes = self.download_heatmap() => {bytes?}, + _ = self.secondary_state.cancel.cancelled() => return Ok(()) + ); + + let heatmap = serde_json::from_slice::(&heatmap_bytes)?; + + // Save the heatmap: this will be useful on restart, allowing us to reconstruct + // layer metadata without having to re-download it. 
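// A minimal illustrative sketch (not part of this patch) of the write-temp-then-rename
// pattern that `crashsafe_overwrite` provides, shown with plain std APIs; the exact
// fsync policy and temp-file naming of the pageserver are not reproduced here. A crash
// midway leaves either the old heatmap or a stray temp file, never a truncated one.
use std::fs::File;
use std::io::Write;
use std::path::Path;

fn write_atomically(final_path: &Path, tmp_path: &Path, bytes: &[u8]) -> std::io::Result<()> {
    let mut tmp = File::create(tmp_path)?;
    tmp.write_all(bytes)?;
    // Make the bytes durable before the rename makes them visible.
    tmp.sync_all()?;
    drop(tmp);
    // rename() is atomic on POSIX filesystems: readers observe either the old file
    // or the complete new one.
    std::fs::rename(tmp_path, final_path)?;
    Ok(())
}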
+ let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id); + + let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX); + let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}"); + let heatmap_path_bg = heatmap_path.clone(); + tokio::task::spawn_blocking(move || { + tokio::runtime::Handle::current().block_on(async move { + VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await + }) + }) + .await + .expect("Blocking task is never aborted") + .maybe_fatal_err(&context_msg)?; + + tracing::debug!("Wrote local heatmap to {}", heatmap_path); + + // Download the layers in the heatmap + for timeline in heatmap.timelines { + if self.secondary_state.cancel.is_cancelled() { + return Ok(()); + } + + let timeline_id = timeline.timeline_id; + self.download_timeline(timeline) + .instrument(tracing::info_span!( + "secondary_download_timeline", + tenant_id=%tenant_shard_id.tenant_id, + shard_id=%tenant_shard_id.shard_slug(), + %timeline_id + )) + .await?; + } + + Ok(()) + } + + async fn download_heatmap(&self) -> Result, UpdateError> { + debug_assert_current_span_has_tenant_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + // TODO: make download conditional on ETag having changed since last download + // (https://github.com/neondatabase/neon/issues/6199) + tracing::debug!("Downloading heatmap for secondary tenant",); + + let heatmap_path = remote_heatmap_path(tenant_shard_id); + + let heatmap_bytes = backoff::retry( + || async { + let download = self + .remote_storage + .download(&heatmap_path) + .await + .map_err(UpdateError::from)?; + let mut heatmap_bytes = Vec::new(); + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?; + Ok(heatmap_bytes) + }, + |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), + FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, + "download heatmap", + backoff::Cancel::new(self.secondary_state.cancel.clone(), || { + UpdateError::Cancelled + }), + ) + .await?; + + SECONDARY_MODE.download_heatmap.inc(); + + Ok(heatmap_bytes) + } + + async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + let timeline_path = self + .conf + .timeline_path(tenant_shard_id, &timeline.timeline_id); + + // Accumulate updates to the state + let mut touched = Vec::new(); + + // Clone a view of what layers already exist on disk + let timeline_state = self + .secondary_state + .detail + .lock() + .unwrap() + .timelines + .get(&timeline.timeline_id) + .cloned(); + + let timeline_state = match timeline_state { + Some(t) => t, + None => { + // We have no existing state: need to scan local disk for layers first. 
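// A minimal illustrative sketch (not part of this patch) of the disk-versus-heatmap
// reconciliation performed below, as a standalone set computation with stand-in types:
// layers present locally but absent from the latest heatmap become deletion candidates,
// while layers named in the heatmap but missing locally become download candidates.
use std::collections::HashSet;

fn reconcile<'a>(
    on_disk: &'a HashSet<String>,
    in_heatmap: &'a HashSet<String>,
) -> (Vec<&'a String>, Vec<&'a String>) {
    let to_delete: Vec<&String> = on_disk.difference(in_heatmap).collect();
    let to_download: Vec<&String> = in_heatmap.difference(on_disk).collect();
    (to_delete, to_download)
}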
+ let timeline_state = + init_timeline_state(self.conf, tenant_shard_id, &timeline).await; + + // Re-acquire detail lock now that we're done with async load from local FS + self.secondary_state + .detail + .lock() + .unwrap() + .timelines + .insert(timeline.timeline_id, timeline_state.clone()); + timeline_state + } + }; + + let layers_in_heatmap = timeline + .layers + .iter() + .map(|l| &l.name) + .collect::>(); + let layers_on_disk = timeline_state + .on_disk_layers + .iter() + .map(|l| l.0) + .collect::>(); + + // Remove on-disk layers that are no longer present in heatmap + for layer in layers_on_disk.difference(&layers_in_heatmap) { + let local_path = timeline_path.join(layer.to_string()); + tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",); + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found) + .maybe_fatal_err("Removing secondary layer")?; + } + + // Download heatmap layers that are not present on local disk, or update their + // access time if they are already present. + for layer in timeline.layers { + if self.secondary_state.cancel.is_cancelled() { + return Ok(()); + } + + // Existing on-disk layers: just update their access time. + if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) { + tracing::debug!("Layer {} is already on disk", layer.name); + if on_disk.metadata != LayerFileMetadata::from(&layer.metadata) + || on_disk.access_time != layer.access_time + { + // We already have this layer on disk. Update its access time. + tracing::debug!( + "Access time updated for layer {}: {} -> {}", + layer.name, + strftime(&on_disk.access_time), + strftime(&layer.access_time) + ); + touched.push(layer); + } + continue; + } else { + tracing::debug!("Layer {} not present on disk yet", layer.name); + } + + // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more + // recently than it was evicted. + if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) { + if &layer.access_time > evicted_at { + tracing::info!( + "Re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + } else { + tracing::trace!( + "Not re-downloading evicted layer {}, accessed at {}, evicted at {}", + layer.name, + strftime(&layer.access_time), + strftime(evicted_at) + ); + continue; + } + } + + // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally + let downloaded_bytes = match download_layer_file( + self.conf, + self.remote_storage, + *tenant_shard_id, + timeline.timeline_id, + &layer.name, + &LayerFileMetadata::from(&layer.metadata), + &self.secondary_state.cancel, + ) + .await + { + Ok(bytes) => bytes, + Err(e) => { + if let DownloadError::NotFound = e { + // A heatmap might be out of date and refer to a layer that doesn't exist any more. + // This is harmless: continue to download the next layer. It is expected during compaction + // GC. + tracing::debug!( + "Skipped downloading missing layer {}, raced with compaction/gc?", + layer.name + ); + continue; + } else { + return Err(e.into()); + } + } + }; + + if downloaded_bytes != layer.metadata.file_size { + let local_path = timeline_path.join(layer.name.to_string()); + + tracing::warn!( + "Downloaded layer {} with unexpected size {} != {}. 
Removing download.", + layer.name, + downloaded_bytes, + layer.metadata.file_size + ); + + tokio::fs::remove_file(&local_path) + .await + .or_else(fs_ext::ignore_not_found)?; + } + + SECONDARY_MODE.download_layer.inc(); + touched.push(layer) + } + + // Write updates to state to record layers we just downloaded or touched. + { + let mut detail = self.secondary_state.detail.lock().unwrap(); + let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default(); + + tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); + + for t in touched { + use std::collections::hash_map::Entry; + match timeline_detail.on_disk_layers.entry(t.name.clone()) { + Entry::Occupied(mut v) => { + v.get_mut().access_time = t.access_time; + } + Entry::Vacant(e) => { + e.insert(OnDiskState::new( + self.conf, + tenant_shard_id, + &timeline.timeline_id, + t.name, + LayerFileMetadata::from(&t.metadata), + t.access_time, + )); + } + } + } + } + + Ok(()) + } +} + +/// Scan local storage and build up Layer objects based on the metadata in a HeatMapTimeline +async fn init_timeline_state( + conf: &'static PageServerConf, + tenant_shard_id: &TenantShardId, + heatmap: &HeatMapTimeline, +) -> SecondaryDetailTimeline { + let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); + let mut detail = SecondaryDetailTimeline::default(); + + let mut dir = match tokio::fs::read_dir(&timeline_path).await { + Ok(d) => d, + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + let context = format!("Creating timeline directory {timeline_path}"); + tracing::info!("{}", context); + tokio::fs::create_dir_all(&timeline_path) + .await + .fatal_err(&context); + + // No entries to report: drop out. + return detail; + } else { + on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}")); + } + } + }; + + // As we iterate through layers found on disk, we will look up their metadata from this map. + // Layers not present in metadata will be discarded. + let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> = + heatmap.layers.iter().map(|l| (&l.name, l)).collect(); + + while let Some(dentry) = dir + .next_entry() + .await + .fatal_err(&format!("Listing {timeline_path}")) + { + let dentry_file_name = dentry.file_name(); + let file_name = dentry_file_name.to_string_lossy(); + let local_meta = dentry.metadata().await.fatal_err(&format!( + "Read metadata on {}", + dentry.path().to_string_lossy() + )); + + // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant. + if file_name == METADATA_FILE_NAME { + continue; + } + + match LayerFileName::from_str(&file_name) { + Ok(name) => { + let remote_meta = heatmap_metadata.get(&name); + match remote_meta { + Some(remote_meta) => { + // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784) + if local_meta.len() != remote_meta.metadata.file_size { + // This should not happen, because we do crashsafe write-then-rename when downloading + // layers, and layers in remote storage are immutable. Remove the local file because + // we cannot trust it. + tracing::warn!( + "Removing local layer {name} with unexpected local size {} != {}", + local_meta.len(), + remote_meta.metadata.file_size + ); + } else { + // We expect the access time to be initialized immediately afterwards, when + // the latest heatmap is applied to the state. 
+ detail.on_disk_layers.insert( + name.clone(), + OnDiskState::new( + conf, + tenant_shard_id, + &heatmap.timeline_id, + name, + LayerFileMetadata::from(&remote_meta.metadata), + remote_meta.access_time, + ), + ); + } + } + None => { + // FIXME: consider some optimization when transitioning from attached to secondary: maybe + // wait until we have seen a heatmap that is more recent than the most recent on-disk state? Otherwise + // we will end up deleting any layers which were created+uploaded more recently than the heatmap. + tracing::info!( + "Removing secondary local layer {} because it's absent in heatmap", + name + ); + tokio::fs::remove_file(&dentry.path()) + .await + .or_else(fs_ext::ignore_not_found) + .fatal_err(&format!( + "Removing layer {}", + dentry.path().to_string_lossy() + )); + } + } + } + Err(_) => { + // Ignore it. + tracing::warn!("Unexpected file in timeline directory: {file_name}"); + } + } + } + + detail +} diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index ece2b93ce1..ef01c33e8e 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -1,5 +1,6 @@ use std::{ collections::HashMap, + pin::Pin, sync::{Arc, Weak}, time::{Duration, Instant}, }; @@ -7,35 +8,86 @@ use std::{ use crate::{ metrics::SECONDARY_MODE, tenant::{ - config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path, - secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant, + config::AttachmentMode, + mgr::TenantManager, + remote_timeline_client::remote_heatmap_path, + span::debug_assert_current_span_has_tenant_id, + tasks::{warn_when_period_overrun, BackgroundLoopKind}, + Tenant, }, }; +use futures::Future; use md5; use pageserver_api::shard::TenantShardId; +use rand::Rng; use remote_storage::GenericRemoteStorage; -use tokio::task::JoinSet; +use super::{ + scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs}, + CommandRequest, +}; use tokio_util::sync::CancellationToken; -use tracing::instrument; -use utils::{backoff, completion::Barrier}; +use tracing::{info_span, instrument, Instrument}; +use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop}; -use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand}; +use super::{heatmap::HeatMapTenant, UploadCommand}; -/// Period between heatmap uploader walking Tenants to look for work to do. -/// If any tenants have a heatmap upload period lower than this, it will be adjusted -/// downward to match. 
-const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000); -const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000); +pub(super) async fn heatmap_uploader_task( + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, +) { + let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency; + + let generator = HeatmapUploader { + tenant_manager, + remote_storage, + cancel: cancel.clone(), + tenants: HashMap::new(), + }; + let mut scheduler = Scheduler::new(generator, concurrency); + + scheduler + .run(command_queue, background_jobs_can_start, cancel) + .instrument(info_span!("heatmap_uploader")) + .await +} + +/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event +/// handling loop and mutates it as needed: there are no locks here, because that event loop +/// can hold &mut references to this type throughout. +struct HeatmapUploader { + tenant_manager: Arc, + remote_storage: GenericRemoteStorage, + cancel: CancellationToken, + + tenants: HashMap, +} struct WriteInProgress { barrier: Barrier, } +impl RunningJob for WriteInProgress { + fn get_barrier(&self) -> Barrier { + self.barrier.clone() + } +} + struct UploadPending { tenant: Arc, last_digest: Option, + target_time: Option, + period: Option, +} + +impl scheduler::PendingJob for UploadPending { + fn get_tenant_shard_id(&self) -> &TenantShardId { + self.tenant.get_tenant_shard_id() + } } struct WriteComplete { @@ -45,6 +97,12 @@ struct WriteComplete { next_upload: Option, } +impl scheduler::Completion for WriteComplete { + fn get_tenant_shard_id(&self) -> &TenantShardId { + &self.tenant_shard_id + } +} + /// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember /// when we last did a write. We only populate this after doing at least one /// write for a tenant -- this avoids holding state for tenants that have @@ -68,267 +126,111 @@ struct UploaderTenantState { next_upload: Option, } -/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event -/// handling loop and mutates it as needed: there are no locks here, because that event loop -/// can hold &mut references to this type throughout. -struct HeatmapUploader { - tenant_manager: Arc, - remote_storage: GenericRemoteStorage, - cancel: CancellationToken, +type Scheduler = TenantBackgroundJobs< + HeatmapUploader, + UploadPending, + WriteInProgress, + WriteComplete, + UploadCommand, +>; - tenants: HashMap, - - /// Tenants with work to do, for which tasks should be spawned as soon as concurrency - /// limits permit it. - tenants_pending: std::collections::VecDeque, - - /// Tenants for which a task in `tasks` has been spawned. - tenants_uploading: HashMap, - - tasks: JoinSet<()>, - - /// Channel for our child tasks to send results to: we use a channel for results rather than - /// just getting task results via JoinSet because we need the channel's recv() "sleep until something - /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty" - /// behavior. 
- task_result_tx: tokio::sync::mpsc::UnboundedSender, - task_result_rx: tokio::sync::mpsc::UnboundedReceiver, - - concurrent_uploads: usize, - - scheduling_interval: Duration, -} - -/// The uploader task runs a loop that periodically wakes up and schedules tasks for -/// tenants that require an upload, or handles any commands that have been sent into -/// `command_queue`. No I/O is done in this loop: that all happens in the tasks we -/// spawn. -/// -/// Scheduling iterations are somewhat infrequent. However, each one will enqueue -/// all tenants that require an upload, and in between scheduling iterations we will -/// continue to spawn new tasks for pending tenants, as our concurrency limit permits. -/// -/// While we take a CancellationToken here, it is subordinate to the CancellationTokens -/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise -/// we might block waiting on a Tenant. -pub(super) async fn heatmap_uploader_task( - tenant_manager: Arc, - remote_storage: GenericRemoteStorage, - mut command_queue: tokio::sync::mpsc::Receiver>, - background_jobs_can_start: Barrier, - cancel: CancellationToken, -) -> anyhow::Result<()> { - let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency; - - let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel(); - - let mut uploader = HeatmapUploader { - tenant_manager, - remote_storage, - cancel: cancel.clone(), - tasks: JoinSet::new(), - tenants: HashMap::new(), - tenants_pending: std::collections::VecDeque::new(), - tenants_uploading: HashMap::new(), - task_result_tx: result_tx, - task_result_rx: result_rx, - concurrent_uploads, - scheduling_interval: DEFAULT_SCHEDULING_INTERVAL, - }; - - tracing::info!("Waiting for background_jobs_can start..."); - background_jobs_can_start.wait().await; - tracing::info!("background_jobs_can is ready, proceeding."); - - while !cancel.is_cancelled() { - // Look for new work: this is relatively expensive because we have to go acquire the lock on - // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones - // require an upload. - uploader.schedule_iteration().await?; - - // Between scheduling iterations, we will: - // - Drain any complete tasks and spawn pending tasks - // - Handle incoming administrative commands - // - Check our cancellation token - let next_scheduling_iteration = Instant::now() - .checked_add(uploader.scheduling_interval) - .unwrap_or_else(|| { - tracing::warn!( - "Scheduling interval invalid ({}s), running immediately!", - uploader.scheduling_interval.as_secs_f64() - ); - Instant::now() - }); - loop { - tokio::select! { - _ = cancel.cancelled() => { - // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation. 
- tracing::info!("Heatmap uploader joining tasks"); - while let Some(_r) = uploader.tasks.join_next().await {}; - tracing::info!("Heatmap uploader terminating"); - - break; - }, - _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => { - tracing::debug!("heatmap_uploader_task: woke for scheduling interval"); - break;}, - cmd = command_queue.recv() => { - tracing::debug!("heatmap_uploader_task: woke for command queue"); - let cmd = match cmd { - Some(c) =>c, - None => { - // SecondaryController was destroyed, and this has raced with - // our CancellationToken - tracing::info!("Heatmap uploader terminating"); - cancel.cancel(); - break; - } - }; - - let CommandRequest{ - response_tx, - payload - } = cmd; - uploader.handle_command(payload, response_tx); - }, - _ = uploader.process_next_completion() => { - if !cancel.is_cancelled() { - uploader.spawn_pending(); - } - } - } - } - } - - Ok(()) -} - -impl HeatmapUploader { - /// Periodic execution phase: inspect all attached tenants and schedule any work they require. - async fn schedule_iteration(&mut self) -> anyhow::Result<()> { +#[async_trait::async_trait] +impl JobGenerator + for HeatmapUploader +{ + async fn schedule(&mut self) -> SchedulingResult { // Cull any entries in self.tenants whose Arc is gone self.tenants .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some()); - // The priority order of previously scheduled work may be invalidated by current state: drop - // all pending work (it will be re-scheduled if still needed) - self.tenants_pending.clear(); - - // Used a fixed 'now' through the following loop, for efficiency and fairness. let now = Instant::now(); - // While iterating over the potentially-long list of tenants, we will periodically yield - // to avoid blocking executor. - const YIELD_ITERATIONS: usize = 1000; + let mut result = SchedulingResult { + jobs: Vec::new(), + want_interval: None, + }; - // Iterate over tenants looking for work to do. let tenants = self.tenant_manager.get_attached_active_tenant_shards(); - for (i, tenant) in tenants.into_iter().enumerate() { - // Process is shutting down, drop out - if self.cancel.is_cancelled() { - return Ok(()); - } - // Skip tenants that already have a write in flight - if self - .tenants_uploading - .contains_key(tenant.get_tenant_shard_id()) - { - continue; - } + yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| { + let period = match tenant.get_heatmap_period() { + None => { + // Heatmaps are disabled for this tenant + return; + } + Some(period) => { + // If any tenant has asked for uploads more frequent than our scheduling interval, + // reduce it to match so that we can keep up. This is mainly useful in testing, where + // we may set rather short intervals. + result.want_interval = match result.want_interval { + None => Some(period), + Some(existing) => Some(std::cmp::min(period, existing)), + }; - self.maybe_schedule_upload(&now, tenant); + period + } + }; - if i + 1 % YIELD_ITERATIONS == 0 { - tokio::task::yield_now().await; - } - } - - // Spawn tasks for as many of our pending tenants as we can. - self.spawn_pending(); - - Ok(()) - } - - /// - /// Cancellation: this method is cancel-safe. 
- async fn process_next_completion(&mut self) { - match self.task_result_rx.recv().await { - Some(r) => { - self.on_completion(r); - } - None => { - unreachable!("Result sender is stored on Self"); - } - } - } - - /// The 'maybe' refers to the tenant's state: whether it is configured - /// for heatmap uploads at all, and whether sufficient time has passed - /// since the last upload. - fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc) { - match tenant.get_heatmap_period() { - None => { - // Heatmaps are disabled for this tenant + // Stale attachments do not upload anything: if we are in this state, there is probably some + // other attachment in mode Single or Multi running on another pageserver, and we don't + // want to thrash and overwrite their heatmap uploads. + if tenant.get_attach_mode() == AttachmentMode::Stale { return; } - Some(period) => { - // If any tenant has asked for uploads more frequent than our scheduling interval, - // reduce it to match so that we can keep up. This is mainly useful in testing, where - // we may set rather short intervals. - if period < self.scheduling_interval { - self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL); - } + + // Create an entry in self.tenants if one doesn't already exist: this will later be updated + // with the completion time in on_completion. + let state = self + .tenants + .entry(*tenant.get_tenant_shard_id()) + .or_insert_with(|| { + let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period); + + UploaderTenantState { + tenant: Arc::downgrade(&tenant), + last_upload: None, + next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)), + last_digest: None, + } + }); + + // Decline to do the upload if insufficient time has passed + if state.next_upload.map(|nu| nu > now).unwrap_or(false) { + return; } - } - // Stale attachments do not upload anything: if we are in this state, there is probably some - // other attachment in mode Single or Multi running on another pageserver, and we don't - // want to thrash and overwrite their heatmap uploads. - if tenant.get_attach_mode() == AttachmentMode::Stale { - return; - } - - // Create an entry in self.tenants if one doesn't already exist: this will later be updated - // with the completion time in on_completion. 
- let state = self - .tenants - .entry(*tenant.get_tenant_shard_id()) - .or_insert_with(|| UploaderTenantState { - tenant: Arc::downgrade(&tenant), - last_upload: None, - next_upload: Some(Instant::now()), - last_digest: None, + let last_digest = state.last_digest; + result.jobs.push(UploadPending { + tenant, + last_digest, + target_time: state.next_upload, + period: Some(period), }); + }) + .await + .ok(); - // Decline to do the upload if insufficient time has passed - if state.next_upload.map(|nu| &nu > now).unwrap_or(false) { - return; - } + result + } - let last_digest = state.last_digest; - self.tenants_pending.push_back(UploadPending { + fn spawn( + &mut self, + job: UploadPending, + ) -> ( + WriteInProgress, + Pin + Send>>, + ) { + let UploadPending { tenant, last_digest, - }) - } + target_time, + period, + } = job; - fn spawn_pending(&mut self) { - while !self.tenants_pending.is_empty() - && self.tenants_uploading.len() < self.concurrent_uploads - { - // unwrap: loop condition includes !is_empty() - let pending = self.tenants_pending.pop_front().unwrap(); - self.spawn_upload(pending.tenant, pending.last_digest); - } - } - - fn spawn_upload(&mut self, tenant: Arc, last_digest: Option) { let remote_storage = self.remote_storage.clone(); - let tenant_shard_id = *tenant.get_tenant_shard_id(); let (completion, barrier) = utils::completion::channel(); - let result_tx = self.task_result_tx.clone(); - self.tasks.spawn(async move { + let tenant_shard_id = *tenant.get_tenant_shard_id(); + (WriteInProgress { barrier }, Box::pin(async move { // Guard for the barrier in [`WriteInProgress`] let _completion = completion; @@ -362,22 +264,47 @@ impl HeatmapUploader { }; let now = Instant::now(); + + // If the job had a target execution time, we may check our final execution + // time against that for observability purposes. 
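// A minimal illustrative sketch (not part of this patch) of the lag measurement made
// below, in isolation with stand-in names: the elapsed time from the job's target
// start time includes both scheduling delay and execution time, so comparing it with
// the configured period shows whether uploads are keeping up.
use std::time::{Duration, Instant};

fn check_overrun(target_time: Instant, period: Duration) {
    // duration_since saturates to zero if we somehow ran early.
    let elapsed = Instant::now().duration_since(target_time);
    if elapsed > period {
        eprintln!("background job overran its period: took {elapsed:?}, period {period:?}");
    }
}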
+ if let (Some(target_time), Some(period)) = (target_time, period) { + // Elapsed time includes any scheduling lag as well as the execution of the job + let elapsed = now.duration_since(target_time); + + warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload); + } + let next_upload = tenant .get_heatmap_period() .and_then(|period| now.checked_add(period)); - result_tx - .send(WriteComplete { + WriteComplete { tenant_shard_id: *tenant.get_tenant_shard_id(), completed_at: now, digest, next_upload, - }) - .ok(); - }); + } + }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) + } - self.tenants_uploading - .insert(tenant_shard_id, WriteInProgress { barrier }); + fn on_command(&mut self, command: UploadCommand) -> anyhow::Result { + let tenant_shard_id = command.get_tenant_shard_id(); + + tracing::info!( + tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), + "Starting heatmap write on command"); + let tenant = self + .tenant_manager + .get_attached_tenant_shard(*tenant_shard_id, true) + .map_err(|e| anyhow::anyhow!(e))?; + + Ok(UploadPending { + // Ignore our state for last digest: this forces an upload even if nothing has changed + last_digest: None, + tenant, + target_time: None, + period: None, + }) } #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))] @@ -389,7 +316,6 @@ impl HeatmapUploader { digest, next_upload, } = completion; - self.tenants_uploading.remove(&tenant_shard_id); use std::collections::hash_map::Entry; match self.tenants.entry(tenant_shard_id) { Entry::Vacant(_) => { @@ -402,69 +328,6 @@ impl HeatmapUploader { } } } - - fn handle_command( - &mut self, - command: UploadCommand, - response_tx: tokio::sync::oneshot::Sender, - ) { - match command { - UploadCommand::Upload(tenant_shard_id) => { - // If an upload was ongoing for this tenant, let it finish first. - let barrier = if let Some(writing_state) = - self.tenants_uploading.get(&tenant_shard_id) - { - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Waiting for heatmap write to complete"); - writing_state.barrier.clone() - } else { - // Spawn the upload then immediately wait for it. This will block processing of other commands and - // starting of other background work. - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Starting heatmap write on command"); - let tenant = match self - .tenant_manager - .get_attached_tenant_shard(tenant_shard_id, true) - { - Ok(t) => t, - Err(e) => { - // Drop result of send: we don't care if caller dropped their receiver - drop(response_tx.send(CommandResponse { - result: Err(e.into()), - })); - return; - } - }; - self.spawn_upload(tenant, None); - let writing_state = self - .tenants_uploading - .get(&tenant_shard_id) - .expect("We just inserted this"); - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Waiting for heatmap upload to complete"); - - writing_state.barrier.clone() - }; - - // This task does no I/O: it only listens for a barrier's completion and then - // sends to the command response channel. It is therefore safe to spawn this without - // any gates/task_mgr hooks. 
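// [Example] The completion/barrier pattern used by spawn() above, in isolation: the task
// holds a completion guard from utils::completion::channel(), and clones of the paired
// Barrier resolve once that guard is dropped, which is how on-demand commands wait for an
// upload that is already in flight instead of starting a duplicate. `do_upload` is a
// hypothetical stand-in for the real upload future.
async fn wait_for_spawned_upload() {
    let (completion, barrier) = utils::completion::channel();

    tokio::spawn(async move {
        // Keep the guard alive for the duration of the work; dropping it releases waiters.
        let _completion = completion;
        do_upload().await;
    });

    // A command handler clones the barrier and awaits the in-flight work.
    barrier.clone().wait().await;
}

async fn do_upload() {}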
- tokio::task::spawn(async move { - barrier.wait().await; - - tracing::info!( - tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), - "Heatmap upload complete"); - - // Drop result of send: we don't care if caller dropped their receiver - drop(response_tx.send(CommandResponse { result: Ok(()) })) - }); - } - } - } } enum UploadHeatmapOutcome { @@ -487,7 +350,6 @@ enum UploadHeatmapError { /// The inner upload operation. This will skip if `last_digest` is Some and matches the digest /// of the object we would have uploaded. -#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))] async fn upload_tenant_heatmap( remote_storage: GenericRemoteStorage, tenant: &Arc, diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs new file mode 100644 index 0000000000..cf01a100d9 --- /dev/null +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -0,0 +1,361 @@ +use async_trait; +use futures::Future; +use std::{ + collections::HashMap, + marker::PhantomData, + pin::Pin, + time::{Duration, Instant}, +}; + +use pageserver_api::shard::TenantShardId; +use tokio::task::JoinSet; +use tokio_util::sync::CancellationToken; +use utils::{completion::Barrier, yielding_loop::yielding_loop}; + +use super::{CommandRequest, CommandResponse}; + +/// Scheduling interval is the time between calls to JobGenerator::schedule. +/// When we schedule jobs, the job generator may provide a hint of its preferred +/// interval, which we will respect within these intervals. +const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10); +const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1); + +/// Scheduling helper for background work across many tenants. +/// +/// Systems that need to run background work across many tenants may use this type +/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`] +/// implementation to provide the work to execute. This is a simple scheduler that just +/// polls the generator for outstanding work, replacing its queue of pending work with +/// what the generator yields on each call: the job generator can change its mind about +/// the order of jobs between calls. The job generator is notified when jobs complete, +/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement +/// admin APIs). +/// +/// For an example see [`crate::tenant::secondary::heatmap_uploader`] +/// +/// G: A JobGenerator that this scheduler will poll to find pending jobs +/// PJ: 'Pending Job': type for job descriptors that are ready to run +/// RJ: 'Running Job' type' for jobs that have been spawned +/// C : 'Completion' type that spawned jobs will send when they finish +/// CMD: 'Command' type that the job generator will accept to create jobs on-demand +pub(super) struct TenantBackgroundJobs +where + G: JobGenerator, + C: Completion, + PJ: PendingJob, + RJ: RunningJob, +{ + generator: G, + + /// Ready to run. Will progress to `running` once concurrent limit is satisfied, or + /// be removed on next scheduling pass. + pending: std::collections::VecDeque, + + /// Tasks currently running in Self::tasks for these tenants. Check this map + /// before pushing more work into pending for the same tenant. + running: HashMap, + + tasks: JoinSet, + + concurrency: usize, + + /// How often we would like schedule_interval to be called. 
+ pub(super) scheduling_interval: Duration, + + _phantom: PhantomData<(PJ, RJ, C, CMD)>, +} + +#[async_trait::async_trait] +pub(crate) trait JobGenerator +where + C: Completion, + PJ: PendingJob, + RJ: RunningJob, +{ + /// Called at each scheduling interval. Return a list of jobs to run, most urgent first. + /// + /// This function may be expensive (e.g. walk all tenants), but should not do any I/O. + /// Implementations should take care to yield the executor periodically if running + /// very long loops. + /// + /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending + /// jobs is not drained by the next scheduling interval, pending jobs will be cleared + /// and re-generated. + async fn schedule(&mut self) -> SchedulingResult; + + /// Called when a pending job is ready to be run. + /// + /// The job generation provides a future, and a RJ (Running Job) descriptor that tracks it. + fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin + Send>>); + + /// Called when a job previously spawned with spawn() transmits its completion + fn on_completion(&mut self, completion: C); + + /// Called when a command is received. A job will be spawned immediately if the return + /// value is Some, ignoring concurrency limits and the pending queue. + fn on_command(&mut self, cmd: CMD) -> anyhow::Result; +} + +/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling +pub(super) struct SchedulingResult { + pub(super) jobs: Vec, + /// The job generator would like to be called again this soon + pub(super) want_interval: Option, +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait PendingJob { + fn get_tenant_shard_id(&self) -> &TenantShardId; +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait Completion: Send + 'static { + fn get_tenant_shard_id(&self) -> &TenantShardId; +} + +/// See [`TenantBackgroundJobs`]. +pub(super) trait RunningJob { + fn get_barrier(&self) -> Barrier; +} + +impl TenantBackgroundJobs +where + C: Completion, + PJ: PendingJob, + RJ: RunningJob, + G: JobGenerator, +{ + pub(super) fn new(generator: G, concurrency: usize) -> Self { + Self { + generator, + pending: std::collections::VecDeque::new(), + running: HashMap::new(), + tasks: JoinSet::new(), + concurrency, + scheduling_interval: MAX_SCHEDULING_INTERVAL, + _phantom: PhantomData, + } + } + + pub(super) async fn run( + &mut self, + mut command_queue: tokio::sync::mpsc::Receiver>, + background_jobs_can_start: Barrier, + cancel: CancellationToken, + ) { + tracing::info!("Waiting for background_jobs_can start..."); + background_jobs_can_start.wait().await; + tracing::info!("background_jobs_can is ready, proceeding."); + + while !cancel.is_cancelled() { + // Look for new work: this is relatively expensive because we have to go acquire the lock on + // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones + // require an upload. 
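// [Example] A toy JobGenerator implementation to make the scheduler's generic parameters
// concrete (the angle-bracketed type parameters were lost in extraction above: the scheduler
// is generic over a pending-job, running-job, completion and command type). Everything named
// Demo* below is illustrative, not part of the patch.
use futures::Future;
use std::pin::Pin;

struct DemoGenerator;
struct DemoJob { tenant_shard_id: TenantShardId }
struct DemoRunning { barrier: Barrier }
struct DemoComplete { tenant_shard_id: TenantShardId }

impl PendingJob for DemoJob {
    fn get_tenant_shard_id(&self) -> &TenantShardId { &self.tenant_shard_id }
}
impl RunningJob for DemoRunning {
    fn get_barrier(&self) -> Barrier { self.barrier.clone() }
}
impl Completion for DemoComplete {
    fn get_tenant_shard_id(&self) -> &TenantShardId { &self.tenant_shard_id }
}

#[async_trait::async_trait]
impl JobGenerator<DemoJob, DemoRunning, DemoComplete, TenantShardId> for DemoGenerator {
    async fn schedule(&mut self) -> SchedulingResult<DemoJob> {
        // Nothing periodic in this toy; a real generator returns jobs, most urgent first.
        SchedulingResult { jobs: Vec::new(), want_interval: None }
    }

    fn spawn(&mut self, job: DemoJob) -> (DemoRunning, Pin<Box<dyn Future<Output = DemoComplete> + Send>>) {
        let (completion, barrier) = utils::completion::channel();
        let tenant_shard_id = job.tenant_shard_id;
        (DemoRunning { barrier }, Box::pin(async move {
            let _completion = completion; // released when the job future finishes
            DemoComplete { tenant_shard_id }
        }))
    }

    fn on_completion(&mut self, _completion: DemoComplete) {}

    fn on_command(&mut self, tenant_shard_id: TenantShardId) -> anyhow::Result<DemoJob> {
        Ok(DemoJob { tenant_shard_id })
    }
}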
+ self.schedule_iteration(&cancel).await; + + if cancel.is_cancelled() { + return; + } + + // Schedule some work, if concurrency limit permits it + self.spawn_pending(); + + // Between scheduling iterations, we will: + // - Drain any complete tasks and spawn pending tasks + // - Handle incoming administrative commands + // - Check our cancellation token + let next_scheduling_iteration = Instant::now() + .checked_add(self.scheduling_interval) + .unwrap_or_else(|| { + tracing::warn!( + "Scheduling interval invalid ({}s)", + self.scheduling_interval.as_secs_f64() + ); + // unwrap(): this constant is small, cannot fail to add to time unless + // we are close to the end of the universe. + Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap() + }); + loop { + tokio::select! { + _ = cancel.cancelled() => { + tracing::info!("joining tasks"); + // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation. + // It is the callers responsibility to make sure that the tasks they scheduled + // respect an appropriate cancellation token, to shut down promptly. It is only + // safe to wait on joining these tasks because we can see the cancellation token + // has been set. + while let Some(_r) = self.tasks.join_next().await {} + tracing::info!("terminating on cancellation token."); + + break; + }, + _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => { + tracing::debug!("woke for scheduling interval"); + break;}, + cmd = command_queue.recv() => { + tracing::debug!("woke for command queue"); + let cmd = match cmd { + Some(c) =>c, + None => { + // SecondaryController was destroyed, and this has raced with + // our CancellationToken + tracing::info!("terminating on command queue destruction"); + cancel.cancel(); + break; + } + }; + + let CommandRequest{ + response_tx, + payload + } = cmd; + self.handle_command(payload, response_tx); + }, + _ = async { + let completion = self.process_next_completion().await; + match completion { + Some(c) => { + self.generator.on_completion(c); + if !cancel.is_cancelled() { + self.spawn_pending(); + } + }, + None => { + // Nothing is running, so just wait: expect that this future + // will be dropped when something in the outer select! fires. + cancel.cancelled().await; + } + } + + } => {} + } + } + } + } + + fn do_spawn(&mut self, job: PJ) { + let tenant_shard_id = *job.get_tenant_shard_id(); + let (in_progress, fut) = self.generator.spawn(job); + + self.tasks.spawn(fut); + + self.running.insert(tenant_shard_id, in_progress); + } + + /// For all pending tenants that are elegible for execution, spawn their task. + /// + /// Caller provides the spawn operation, we track the resulting execution. + fn spawn_pending(&mut self) { + while !self.pending.is_empty() && self.running.len() < self.concurrency { + // unwrap: loop condition includes !is_empty() + let pending = self.pending.pop_front().unwrap(); + self.do_spawn(pending); + } + } + + /// For administrative commands: skip the pending queue, ignore concurrency limits + fn spawn_now(&mut self, job: PJ) -> &RJ { + let tenant_shard_id = *job.get_tenant_shard_id(); + self.do_spawn(job); + self.running + .get(&tenant_shard_id) + .expect("We just inserted this") + } + + /// Wait until the next task completes, and handle its completion + /// + /// Cancellation: this method is cancel-safe. 
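// [Example] How a caller drives the command path handled in the loop above: build a
// CommandRequest carrying a oneshot sender, push it down the mpsc queue the scheduler
// listens on, then await the response. `command_tx` and the use of UploadCommand::Upload
// mirror types visible elsewhere in this patch, but treat the exact shapes as illustrative.
async fn upload_now(
    command_tx: &tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
    tenant_shard_id: TenantShardId,
) -> anyhow::Result<()> {
    let (response_tx, response_rx) = tokio::sync::oneshot::channel();
    command_tx
        .send(CommandRequest {
            payload: UploadCommand::Upload(tenant_shard_id),
            response_tx,
        })
        .await
        .map_err(|_| anyhow::anyhow!("scheduler shut down"))?;
    response_rx
        .await
        .map_err(|_| anyhow::anyhow!("scheduler dropped the request"))?
        .result
}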
+ async fn process_next_completion(&mut self) -> Option { + match self.tasks.join_next().await { + Some(r) => { + // We use a channel to drive completions, but also + // need to drain the JoinSet to avoid completed tasks + // accumulating. These calls are 1:1 because every task + // we spawn into this joinset submits is result to the channel. + let completion = r.expect("Panic in background task"); + + self.running.remove(completion.get_tenant_shard_id()); + Some(completion) + } + None => { + // Nothing is running, so we have nothing to wait for. We may drop out: the + // main even loop will call us again after the next time it has run something. + None + } + } + } + + /// Convert the command into a pending job, spawn it, and when the spawned + /// job completes, send the result down `response_tx`. + fn handle_command( + &mut self, + cmd: CMD, + response_tx: tokio::sync::oneshot::Sender, + ) { + let job = match self.generator.on_command(cmd) { + Ok(j) => j, + Err(e) => { + response_tx.send(CommandResponse { result: Err(e) }).ok(); + return; + } + }; + + let tenant_shard_id = job.get_tenant_shard_id(); + let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) { + barrier + } else { + let running = self.spawn_now(job); + running.get_barrier().clone() + }; + + // This task does no I/O: it only listens for a barrier's completion and then + // sends to the command response channel. It is therefore safe to spawn this without + // any gates/task_mgr hooks. + tokio::task::spawn(async move { + barrier.wait().await; + + response_tx.send(CommandResponse { result: Ok(()) }).ok(); + }); + } + + fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option { + self.running.get(tenant_shard_id).map(|r| r.get_barrier()) + } + + /// Periodic execution phase: inspect all attached tenants and schedule any work they require. + /// + /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`] + /// + /// This function resets the pending list: it is assumed that the caller may change their mind about + /// which tenants need work between calls to schedule_iteration. + async fn schedule_iteration(&mut self, cancel: &CancellationToken) { + let SchedulingResult { + jobs, + want_interval, + } = self.generator.schedule().await; + + // Adjust interval based on feedback from the job generator + if let Some(want_interval) = want_interval { + // Calculation uses second granularity: this scheduler is not intended for high frequency tasks + self.scheduling_interval = Duration::from_secs(std::cmp::min( + std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()), + MAX_SCHEDULING_INTERVAL.as_secs(), + )); + } + + // The priority order of previously scheduled work may be invalidated by current state: drop + // all pending work (it will be re-scheduled if still needed) + self.pending.clear(); + + // While iterating over the potentially-long list of tenants, we will periodically yield + // to avoid blocking executor. 
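// [Example] What a yielding_loop-style helper does, sketched for readers unfamiliar with the
// utility called below: visit every item, but hand control back to the executor every
// `interval` items and stop early on cancellation. The real helper lives in the `utils`
// crate; this standalone version (and its unit error type) is illustrative only.
async fn yielding_loop_sketch<T>(
    interval: usize,
    cancel: &tokio_util::sync::CancellationToken,
    items: impl Iterator<Item = T>,
    mut visit: impl FnMut(T),
) -> Result<(), ()> {
    for (i, item) in items.enumerate() {
        visit(item);
        if (i + 1) % interval == 0 {
            // Let other tasks run between chunks of a potentially long tenant list.
            tokio::task::yield_now().await;
            if cancel.is_cancelled() {
                return Err(());
            }
        }
    }
    Ok(())
}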
+ yielding_loop(1000, cancel, jobs.into_iter(), |job| { + // Skip tenants that already have a write in flight + if !self.running.contains_key(job.get_tenant_shard_id()) { + self.pending.push_back(job); + } + }) + .await + .ok(); + } +} diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 003cf0e92b..7c9103eea8 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; // while being able to use std::fmt::Write's methods use std::fmt::Write as _; use std::ops::Range; -use tokio::sync::RwLock; +use tokio::sync::{RwLock, RwLockWriteGuard}; use super::{DeltaLayerWriter, ResidentLayer}; @@ -246,16 +246,43 @@ impl InMemoryLayer { /// Common subroutine of the public put_wal_record() and put_page_image() functions. /// Adds the page version to the in-memory tree - pub async fn put_value( + pub(crate) async fn put_value( &self, key: Key, lsn: Lsn, val: &Value, ctx: &RequestContext, ) -> Result<()> { - trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); - let inner: &mut _ = &mut *self.inner.write().await; + let mut inner = self.inner.write().await; self.assert_writable(); + self.put_value_locked(&mut inner, key, lsn, val, ctx).await + } + + pub(crate) async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> Result<()> { + let mut inner = self.inner.write().await; + self.assert_writable(); + for (key, vals) in values { + for (lsn, val) in vals { + self.put_value_locked(&mut inner, *key, *lsn, val, ctx) + .await?; + } + } + Ok(()) + } + + async fn put_value_locked( + &self, + locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>, + key: Key, + lsn: Lsn, + val: &Value, + ctx: &RequestContext, + ) -> Result<()> { + trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn); let off = { // Avoid doing allocations for "small" values. @@ -264,7 +291,7 @@ impl InMemoryLayer { let mut buf = smallvec::SmallVec::<[u8; 256]>::new(); buf.clear(); val.ser_into(&mut buf)?; - inner + locked_inner .file .write_blob( &buf, @@ -275,7 +302,7 @@ impl InMemoryLayer { .await? }; - let vec_map = inner.index.entry(key).or_default(); + let vec_map = locked_inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, off).unwrap().0; if old.is_some() { // We already had an entry for this LSN. That's odd.. @@ -285,13 +312,11 @@ impl InMemoryLayer { Ok(()) } - pub async fn put_tombstone(&self, _key_range: Range, _lsn: Lsn) -> Result<()> { + pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range, Lsn)]) -> Result<()> { // TODO: Currently, we just leak the storage for any deleted keys - Ok(()) } - /// Make the layer non-writeable. Only call once. /// Records the end_lsn for non-dropped layers. 
/// `end_lsn` is exclusive pub async fn freeze(&self, end_lsn: Lsn) { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 8ae911b31e..f5adf9d639 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1118,6 +1118,7 @@ impl LayerInner { tracing::info!("evicted layer after unknown residence period"); } } + timeline.metrics.evictions.inc(); timeline .metrics .resident_physical_size_sub(self.desc.file_size); diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 7ff1873eda..aa5894cc37 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -45,6 +45,8 @@ pub(crate) enum BackgroundLoopKind { ConsumptionMetricsCollectMetrics, ConsumptionMetricsSyntheticSizeWorker, InitialLogicalSizeCalculation, + HeatmapUpload, + SecondaryDownload, } impl BackgroundLoopKind { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1e84fa1848..24a92859b7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -373,15 +373,20 @@ pub struct GcInfo { } /// An error happened in a get() operation. -#[derive(thiserror::Error)] -pub enum PageReconstructError { +#[derive(thiserror::Error, Debug)] +pub(crate) enum PageReconstructError { #[error(transparent)] Other(#[from] anyhow::Error), + #[error("Ancestor LSN wait error: {0}")] + AncestorLsnTimeout(#[from] WaitLsnError), + /// The operation was cancelled + #[error("Cancelled")] Cancelled, /// The ancestor of this is being stopped + #[error("ancestor timeline {0} is being stopped")] AncestorStopping(TimelineId), /// An error happened replaying WAL records @@ -402,32 +407,6 @@ enum FlushLayerError { Other(#[from] anyhow::Error), } -impl std::fmt::Debug for PageReconstructError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - Self::Other(err) => err.fmt(f), - Self::Cancelled => write!(f, "cancelled"), - Self::AncestorStopping(timeline_id) => { - write!(f, "ancestor timeline {timeline_id} is being stopped") - } - Self::WalRedo(err) => err.fmt(f), - } - } -} - -impl std::fmt::Display for PageReconstructError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> { - match self { - Self::Other(err) => err.fmt(f), - Self::Cancelled => write!(f, "cancelled"), - Self::AncestorStopping(timeline_id) => { - write!(f, "ancestor timeline {timeline_id} is being stopped") - } - Self::WalRedo(err) => err.fmt(f), - } - } -} - #[derive(Clone, Copy)] pub enum LogicalSizeCalculationCause { Initial, @@ -452,6 +431,21 @@ impl std::fmt::Debug for Timeline { } } +#[derive(thiserror::Error, Debug)] +pub(crate) enum WaitLsnError { + // Called on a timeline which is shutting down + #[error("Shutdown")] + Shutdown, + + // Called on an timeline not in active state or shutting down + #[error("Bad state (not active)")] + BadState, + + // Timeout expired while waiting for LSN to catch up with goal. + #[error("{0}")] + Timeout(String), +} + /// Public interface functions impl Timeline { /// Get the LSN where this branch was created @@ -486,7 +480,7 @@ impl Timeline { /// # Cancel-Safety /// /// This method is cancellation-safe. 
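// [Example] With `#[from] WaitLsnError` on AncestorLsnTimeout, a plain `?` on a wait_lsn()
// call inside a function returning PageReconstructError converts the error automatically;
// the explicit map_err seen later in this patch is only needed where Shutdown/BadState
// should map to different variants. Hedged sketch; `wait_for_ancestor` is a hypothetical
// wrapper, not a function in this patch.
async fn wait_for_ancestor(
    ancestor: &Timeline,
    lsn: Lsn,
    ctx: &RequestContext,
) -> Result<(), PageReconstructError> {
    // WaitLsnError -> PageReconstructError::AncestorLsnTimeout via the derived From impl.
    ancestor.wait_lsn(lsn, ctx).await?;
    Ok(())
}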
- pub async fn get( + pub(crate) async fn get( &self, key: Key, lsn: Lsn, @@ -496,6 +490,11 @@ impl Timeline { return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN"))); } + // This check is debug-only because of the cost of hashing, and because it's a double-check: we + // already checked the key against the shard_identity when looking up the Timeline from + // page_service. + debug_assert!(!self.shard_identity.is_key_disposable(&key)); + // XXX: structured stats collection for layer eviction here. trace!( "get page request for {}@{} from task kind {:?}", @@ -629,24 +628,28 @@ impl Timeline { /// You should call this before any of the other get_* or list_* functions. Calling /// those functions with an LSN that has been processed yet is an error. /// - pub async fn wait_lsn( + pub(crate) async fn wait_lsn( &self, lsn: Lsn, _ctx: &RequestContext, /* Prepare for use by cancellation */ - ) -> anyhow::Result<()> { - anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline"); + ) -> Result<(), WaitLsnError> { + if self.cancel.is_cancelled() { + return Err(WaitLsnError::Shutdown); + } else if !self.is_active() { + return Err(WaitLsnError::BadState); + } // This should never be called from the WAL receiver, because that could lead // to a deadlock. - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager), "wait_lsn cannot be called in WAL receiver" ); - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler), "wait_lsn cannot be called in WAL receiver" ); - anyhow::ensure!( + debug_assert!( task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller), "wait_lsn cannot be called in WAL receiver" ); @@ -660,18 +663,22 @@ impl Timeline { { Ok(()) => Ok(()), Err(e) => { - // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo - drop(_timer); - let walreceiver_status = self.walreceiver_status(); - Err(anyhow::Error::new(e).context({ - format!( + use utils::seqwait::SeqWaitError::*; + match e { + Shutdown => Err(WaitLsnError::Shutdown), + Timeout => { + // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo + drop(_timer); + let walreceiver_status = self.walreceiver_status(); + Err(WaitLsnError::Timeout(format!( "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}", lsn, self.get_last_record_lsn(), self.get_disk_consistent_lsn(), walreceiver_status, - ) - })) + ))) + } + } } } } @@ -1459,6 +1466,7 @@ impl Timeline { max_lsn_wal_lag, auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(), availability_zone: self.conf.availability_zone.clone(), + ingest_batch_size: self.conf.ingest_batch_size, }, broker_client, ctx, @@ -2223,13 +2231,13 @@ impl Timeline { return Err(layer_traversal_error( if cfg!(test) { format!( - "could not find data for key {} at LSN {}, for request at LSN {}\n{}", - key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), + "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}", + key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(), ) } else { format!( - "could not find data for key {} at LSN {}, for request at LSN {}", - key, cont_lsn, request_lsn + "could not find data for key {} (shard {:?}) at LSN {}, for request at 
LSN {}", + key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn ) }, traversal_path, @@ -2289,11 +2297,12 @@ impl Timeline { ancestor .wait_lsn(timeline.ancestor_lsn, ctx) .await - .with_context(|| { - format!( - "wait for lsn {} on ancestor timeline_id={}", - timeline.ancestor_lsn, ancestor.timeline_id - ) + .map_err(|e| match e { + e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e), + WaitLsnError::Shutdown => PageReconstructError::Cancelled, + e @ WaitLsnError::BadState => { + PageReconstructError::Other(anyhow::anyhow!(e)) + } })?; timeline_owned = ancestor; @@ -2471,9 +2480,27 @@ impl Timeline { Ok(()) } - async fn put_tombstone(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - let layer = self.get_layer_for_write(lsn).await?; - layer.put_tombstone(key_range, lsn).await?; + async fn put_values( + &self, + values: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + // Pick the first LSN in the batch to get the layer to write to. + for lsns in values.values() { + if let Some((lsn, _)) = lsns.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_values(values, ctx).await?; + break; + } + } + Ok(()) + } + + async fn put_tombstones(&self, tombstones: &[(Range, Lsn)]) -> anyhow::Result<()> { + if let Some((_, lsn)) = tombstones.first() { + let layer = self.get_layer_for_write(*lsn).await?; + layer.put_tombstones(tombstones).await?; + } Ok(()) } @@ -3035,6 +3062,15 @@ impl Timeline { for range in &partition.ranges { let mut key = range.start; while key < range.end { + if self.shard_identity.is_key_disposable(&key) { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + key = key.next(); + continue; + } let img = match self.get(key, lsn, ctx).await { Ok(img) => img, Err(err) => { @@ -3061,6 +3097,7 @@ impl Timeline { } } }; + image_layer_writer.put_image(key, &img).await?; key = key.next(); } @@ -3631,7 +3668,15 @@ impl Timeline { ))) }); - writer.as_mut().unwrap().put_value(key, lsn, value).await?; + if !self.shard_identity.is_key_disposable(&key) { + writer.as_mut().unwrap().put_value(key, lsn, value).await?; + } else { + debug!( + "Dropping key {} during compaction (it belongs on shard {:?})", + key, + self.shard_identity.get_shard_number(&key) + ); + } if !new_layers.is_empty() { fail_point!("after-timeline-compacted-first-L1"); @@ -4186,7 +4231,7 @@ impl Timeline { .context("Failed to reconstruct a page image:") { Ok(img) => img, - Err(e) => return Err(PageReconstructError::from(e)), + Err(e) => return Err(PageReconstructError::WalRedo(e)), }; if img.len() == page_cache::PAGE_SZ { @@ -4529,8 +4574,16 @@ impl<'a> TimelineWriter<'a> { self.tl.put_value(key, lsn, value, ctx).await } - pub async fn delete(&self, key_range: Range, lsn: Lsn) -> anyhow::Result<()> { - self.tl.put_tombstone(key_range, lsn).await + pub(crate) async fn put_batch( + &self, + batch: &HashMap>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + self.tl.put_values(batch, ctx).await + } + + pub(crate) async fn delete_batch(&self, batch: &[(Range, Lsn)]) -> anyhow::Result<()> { + self.tl.put_tombstones(batch).await } /// Track the end of the latest digested WAL record. @@ -4541,11 +4594,11 @@ impl<'a> TimelineWriter<'a> { /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for /// the 'lsn' or anything older. The previous last record LSN is stored alongside /// the latest and can be read. 
- pub fn finish_write(&self, new_lsn: Lsn) { + pub(crate) fn finish_write(&self, new_lsn: Lsn) { self.tl.finish_write(new_lsn); } - pub fn update_current_logical_size(&self, delta: i64) { + pub(crate) fn update_current_logical_size(&self, delta: i64) { self.tl.update_current_logical_size(delta) } } diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs index e32265afb5..2fab6722b8 100644 --- a/pageserver/src/tenant/timeline/walreceiver.rs +++ b/pageserver/src/tenant/timeline/walreceiver.rs @@ -58,6 +58,7 @@ pub struct WalReceiverConf { pub max_lsn_wal_lag: NonZeroU64, pub auth_token: Option>, pub availability_zone: Option, + pub ingest_batch_size: u64, } pub struct WalReceiver { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 5a5b3d7586..7fa5bb7689 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -411,6 +411,7 @@ impl ConnectionManagerState { let node_id = new_sk.safekeeper_id; let connect_timeout = self.conf.wal_connect_timeout; + let ingest_batch_size = self.conf.ingest_batch_size; let timeline = Arc::clone(&self.timeline); let ctx = ctx.detached_child( TaskKind::WalReceiverConnectionHandler, @@ -430,6 +431,7 @@ impl ConnectionManagerState { connect_timeout, ctx, node_id, + ingest_batch_size, ) .await; @@ -1345,6 +1347,7 @@ mod tests { max_lsn_wal_lag: NonZeroU64::new(1024 * 1024).unwrap(), auth_token: None, availability_zone: None, + ingest_batch_size: 1, }, wal_connection: None, wal_stream_candidates: HashMap::new(), diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 61ab236322..e398d683e5 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS}, + metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, task_mgr, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, @@ -106,6 +106,7 @@ impl From for WalReceiverError { /// Open a connection to the given safekeeper and receive WAL, sending back progress /// messages as we go. +#[allow(clippy::too_many_arguments)] pub(super) async fn handle_walreceiver_connection( timeline: Arc, wal_source_connconf: PgConnectionConfig, @@ -114,6 +115,7 @@ pub(super) async fn handle_walreceiver_connection( connect_timeout: Duration, ctx: RequestContext, node: NodeId, + ingest_batch_size: u64, ) -> Result<(), WalReceiverError> { debug_assert_current_span_has_tenant_and_timeline_id(); @@ -305,7 +307,9 @@ pub(super) async fn handle_walreceiver_connection( { let mut decoded = DecodedWALRecord::default(); - let mut modification = timeline.begin_modification(endlsn); + let mut modification = timeline.begin_modification(startlsn); + let mut uncommitted_records = 0; + let mut filtered_records = 0; while let Some((lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. 
Without this alignment we are @@ -314,14 +318,40 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } - walingest + // Ingest the records without immediately committing them. + let ingested = walingest .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; + if !ingested { + tracing::debug!("ingest: filtered out record @ LSN {lsn}"); + WAL_INGEST.records_filtered.inc(); + filtered_records += 1; + } fail_point!("walreceiver-after-ingest"); last_rec_lsn = lsn; + + // Commit every ingest_batch_size records. Even if we filtered out + // all records, we still need to call commit to advance the LSN. + uncommitted_records += 1; + if uncommitted_records >= ingest_batch_size { + WAL_INGEST + .records_committed + .inc_by(uncommitted_records - filtered_records); + modification.commit(&ctx).await?; + uncommitted_records = 0; + filtered_records = 0; + } + } + + // Commit the remaining records. + if uncommitted_records > 0 { + WAL_INGEST + .records_committed + .inc_by(uncommitted_records - filtered_records); + modification.commit(&ctx).await?; } } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index a6a8972970..8df0c81c7a 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -48,20 +48,18 @@ use postgres_ffi::TransactionId; use postgres_ffi::BLCKSZ; use utils::lsn::Lsn; -pub struct WalIngest<'a> { +pub struct WalIngest { shard: ShardIdentity, - timeline: &'a Timeline, - checkpoint: CheckPoint, checkpoint_modified: bool, } -impl<'a> WalIngest<'a> { +impl WalIngest { pub async fn new( - timeline: &'a Timeline, + timeline: &Timeline, startpoint: Lsn, - ctx: &'_ RequestContext, - ) -> anyhow::Result> { + ctx: &RequestContext, + ) -> anyhow::Result { // Fetch the latest checkpoint into memory, so that we can compare with it // quickly in `ingest_record` and update it when it changes. let checkpoint_bytes = timeline.get_checkpoint(startpoint, ctx).await?; @@ -70,7 +68,6 @@ impl<'a> WalIngest<'a> { Ok(WalIngest { shard: *timeline.get_shard_identity(), - timeline, checkpoint, checkpoint_modified: false, }) @@ -84,6 +81,8 @@ impl<'a> WalIngest<'a> { /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the /// relations/pages that the record affects. 
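// [Example] The commit-batching pattern introduced above, distilled: records are ingested
// into a single DatadirModification and flushed every `ingest_batch_size` records, with one
// final commit for the tail so the last_record_lsn still advances. Sketch only; the real
// loop also tracks filtered records for the WAL_INGEST metrics.
async fn ingest_batched(
    walingest: &mut WalIngest,
    timeline: &Timeline,
    start_lsn: Lsn,
    records: Vec<(Lsn, Bytes)>,
    ingest_batch_size: u64,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    let mut decoded = DecodedWALRecord::default();
    let mut modification = timeline.begin_modification(start_lsn);
    let mut uncommitted = 0u64;

    for (lsn, recdata) in records {
        // ingest_record buffers the change in `modification`; it no longer commits itself.
        walingest
            .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
            .await?;
        uncommitted += 1;
        if uncommitted >= ingest_batch_size {
            modification.commit(ctx).await?; // flush the batch and advance last_record_lsn
            uncommitted = 0;
        }
    }

    if uncommitted > 0 {
        modification.commit(ctx).await?;
    }
    Ok(())
}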
/// + /// This function returns `true` if the record was ingested, and `false` if it was filtered out + /// pub async fn ingest_record( &mut self, recdata: Bytes, @@ -91,11 +90,13 @@ impl<'a> WalIngest<'a> { modification: &mut DatadirModification<'_>, decoded: &mut DecodedWALRecord, ctx: &RequestContext, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { WAL_INGEST.records_received.inc(); + let pg_version = modification.tline.pg_version; + let prev_len = modification.len(); - modification.lsn = lsn; - decode_wal_record(recdata, decoded, self.timeline.pg_version)?; + modification.set_lsn(lsn)?; + decode_wal_record(recdata, decoded, pg_version)?; let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); @@ -132,9 +133,9 @@ impl<'a> WalIngest<'a> { } pg_constants::RM_DBASE_ID => { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID"); + debug!(%info, %pg_version, "handle RM_DBASE_ID"); - if self.timeline.pg_version == 14 { + if pg_version == 14 { if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { let createdb = XlCreateDatabase::decode(&mut buf); debug!("XLOG_DBASE_CREATE v14"); @@ -150,7 +151,7 @@ impl<'a> WalIngest<'a> { .await?; } } - } else if self.timeline.pg_version == 15 { + } else if pg_version == 15 { if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { @@ -170,7 +171,7 @@ impl<'a> WalIngest<'a> { .await?; } } - } else if self.timeline.pg_version == 16 { + } else if pg_version == 16 { if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { @@ -399,19 +400,11 @@ impl<'a> WalIngest<'a> { self.checkpoint_modified = false; } - if modification.is_empty() { - tracing::debug!("ingest: filtered out record @ LSN {lsn}"); - WAL_INGEST.records_filtered.inc(); - modification.tline.finish_write(lsn); - } else { - WAL_INGEST.records_committed.inc(); - modification.commit(ctx).await?; - } + // Note that at this point this record is only cached in the modification + // until commit() is called to flush the data into the repository and update + // the latest LSN. - // Now that this record has been fully handled, including updating the - // checkpoint data, let the repository know that it is up-to-date to this LSN. - - Ok(()) + Ok(modification.len() > prev_len) } /// Do not store this block, but observe it for the purposes of updating our relation size state. @@ -458,7 +451,7 @@ impl<'a> WalIngest<'a> { && (decoded.xl_info == pg_constants::XLOG_FPI || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, self.timeline.pg_version)? + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version)? 
// do not materialize null pages because them most likely be soon replaced with real data && blk.bimg_len != 0 { @@ -511,7 +504,7 @@ impl<'a> WalIngest<'a> { let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - match self.timeline.pg_version { + match modification.tline.pg_version { 14 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -735,7 +728,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + let vm_size = get_relsize(modification, vm_rel, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -816,10 +809,11 @@ impl<'a> WalIngest<'a> { let mut new_heap_blkno: Option = None; let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; + let pg_version = modification.tline.pg_version; assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); - match self.timeline.pg_version { + match pg_version { 16 => { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -882,7 +876,7 @@ impl<'a> WalIngest<'a> { } _ => bail!( "Neon RMGR has no known compatibility with PostgreSQL version {}", - self.timeline.pg_version + pg_version ), } @@ -905,7 +899,7 @@ impl<'a> WalIngest<'a> { // replaying it would fail to find the previous image of the page, because // it doesn't exist. So check if the VM page(s) exist, and skip the WAL // record if it doesn't. - let vm_size = self.get_relsize(vm_rel, modification.lsn, ctx).await?; + let vm_size = get_relsize(modification, vm_rel, ctx).await?; if let Some(blknum) = new_vm_blk { if blknum >= vm_size { new_vm_blk = None; @@ -983,16 +977,14 @@ impl<'a> WalIngest<'a> { let src_db_id = rec.src_db_id; let src_tablespace_id = rec.src_tablespace_id; - // Creating a database is implemented by copying the template (aka. source) database. - // To copy all the relations, we need to ask for the state as of the same LSN, but we - // cannot pass 'lsn' to the Timeline.get_* functions, or they will block waiting for - // the last valid LSN to advance up to it. So we use the previous record's LSN in the - // get calls instead. 
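// [Example] The read-path change used throughout this hunk: relation metadata is now read
// through a Version, either pinned to a committed LSN or layered over the in-progress
// modification so that puts buffered in the current ingest batch are visible before commit().
// A hedged sketch of the two call styles; the function itself is illustrative.
async fn relsize_examples(
    timeline: &Timeline,
    modification: &DatadirModification<'_>,
    rel: RelTag,
    lsn: Lsn,
    ctx: &RequestContext,
) -> anyhow::Result<()> {
    // Read against committed history only (the style used by the tests below).
    let committed = timeline
        .get_rel_size(rel, Version::Lsn(lsn), true, ctx)
        .await?;

    // Read through the open modification: sees uncommitted changes from the same batch,
    // which is what makes batched commits safe for records that read their own writes.
    let in_flight = modification
        .tline
        .get_rel_size(rel, Version::Modified(modification), true, ctx)
        .await?;

    let _ = (committed, in_flight);
    Ok(())
}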
- let req_lsn = modification.tline.get_last_record_lsn(); - let rels = modification .tline - .list_rels(src_tablespace_id, src_db_id, req_lsn, ctx) + .list_rels( + src_tablespace_id, + src_db_id, + Version::Modified(modification), + ctx, + ) .await?; debug!("ingest_xlog_dbase_create: {} rels", rels.len()); @@ -1000,7 +992,12 @@ impl<'a> WalIngest<'a> { // Copy relfilemap let filemap = modification .tline - .get_relmap_file(src_tablespace_id, src_db_id, req_lsn, ctx) + .get_relmap_file( + src_tablespace_id, + src_db_id, + Version::Modified(modification), + ctx, + ) .await?; modification .put_relmap_file(tablespace_id, db_id, filemap, ctx) @@ -1014,7 +1011,7 @@ impl<'a> WalIngest<'a> { let nblocks = modification .tline - .get_rel_size(src_rel, req_lsn, true, ctx) + .get_rel_size(src_rel, Version::Modified(modification), true, ctx) .await?; let dst_rel = RelTag { spcnode: tablespace_id, @@ -1032,7 +1029,13 @@ impl<'a> WalIngest<'a> { let content = modification .tline - .get_rel_page_at_lsn(src_rel, blknum, req_lsn, true, ctx) + .get_rel_page_at_lsn( + src_rel, + blknum, + Version::Modified(modification), + true, + ctx, + ) .await?; modification.put_rel_page_image(dst_rel, blknum, content)?; num_blocks_copied += 1; @@ -1103,7 +1106,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, fsm_physical_page_no, ZERO_PAGE.clone())?; fsm_physical_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > fsm_physical_page_no { // check if something to do: FSM is larger than truncate position self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) @@ -1125,7 +1128,7 @@ impl<'a> WalIngest<'a> { modification.put_rel_page_image(rel, vm_page_no, ZERO_PAGE.clone())?; vm_page_no += 1; } - let nblocks = self.get_relsize(rel, modification.lsn, ctx).await?; + let nblocks = get_relsize(modification, rel, ctx).await?; if nblocks > vm_page_no { // check if something to do: VM is larger than truncate position self.put_rel_truncation(modification, rel, vm_page_no, ctx) @@ -1198,10 +1201,9 @@ impl<'a> WalIngest<'a> { dbnode: xnode.dbnode, relnode: xnode.relnode, }; - let last_lsn = self.timeline.get_last_record_lsn(); if modification .tline - .get_rel_exists(rel, last_lsn, true, ctx) + .get_rel_exists(rel, Version::Modified(modification), true, ctx) .await? { self.put_rel_drop(modification, rel, ctx).await?; @@ -1255,10 +1257,9 @@ impl<'a> WalIngest<'a> { // will block waiting for the last valid LSN to advance up to // it. So we use the previous record's LSN in the get calls // instead. - let req_lsn = modification.tline.get_last_record_lsn(); for segno in modification .tline - .list_slru_segments(SlruKind::Clog, req_lsn, ctx) + .list_slru_segments(SlruKind::Clog, Version::Modified(modification), ctx) .await? { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -1470,20 +1471,6 @@ impl<'a> WalIngest<'a> { Ok(()) } - async fn get_relsize( - &mut self, - rel: RelTag, - lsn: Lsn, - ctx: &RequestContext, - ) -> anyhow::Result { - let nblocks = if !self.timeline.get_rel_exists(rel, lsn, true, ctx).await? { - 0 - } else { - self.timeline.get_rel_size(rel, lsn, true, ctx).await? - }; - Ok(nblocks) - } - async fn handle_rel_extend( &mut self, modification: &mut DatadirModification<'_>, @@ -1495,7 +1482,6 @@ impl<'a> WalIngest<'a> { // Check if the relation exists. We implicitly create relations on first // record. 
// TODO: would be nice if to be more explicit about it - let last_lsn = modification.lsn; // Get current size and put rel creation if rel doesn't exist // @@ -1503,11 +1489,14 @@ impl<'a> WalIngest<'a> { // check the cache too. This is because eagerly checking the cache results in // less work overall and 10% better performance. It's more work on cache miss // but cache miss is rare. - let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) { + let old_nblocks = if let Some(nblocks) = modification + .tline + .get_cached_rel_size(&rel, modification.get_lsn()) + { nblocks - } else if !self - .timeline - .get_rel_exists(rel, last_lsn, true, ctx) + } else if !modification + .tline + .get_rel_exists(rel, Version::Modified(modification), true, ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1517,7 +1506,10 @@ impl<'a> WalIngest<'a> { .context("Relation Error")?; 0 } else { - self.timeline.get_rel_size(rel, last_lsn, true, ctx).await? + modification + .tline + .get_rel_size(rel, Version::Modified(modification), true, ctx) + .await? }; if new_nblocks > old_nblocks { @@ -1570,10 +1562,9 @@ impl<'a> WalIngest<'a> { // Check if the relation exists. We implicitly create relations on first // record. // TODO: would be nice if to be more explicit about it - let last_lsn = self.timeline.get_last_record_lsn(); - let old_nblocks = if !self - .timeline - .get_slru_segment_exists(kind, segno, last_lsn, ctx) + let old_nblocks = if !modification + .tline + .get_slru_segment_exists(kind, segno, Version::Modified(modification), ctx) .await? { // create it with 0 size initially, the logic below will extend it @@ -1582,8 +1573,9 @@ impl<'a> WalIngest<'a> { .await?; 0 } else { - self.timeline - .get_slru_segment_size(kind, segno, last_lsn, ctx) + modification + .tline + .get_slru_segment_size(kind, segno, Version::Modified(modification), ctx) .await? }; @@ -1606,6 +1598,26 @@ impl<'a> WalIngest<'a> { } } +async fn get_relsize( + modification: &DatadirModification<'_>, + rel: RelTag, + ctx: &RequestContext, +) -> anyhow::Result { + let nblocks = if !modification + .tline + .get_rel_exists(rel, Version::Modified(modification), true, ctx) + .await? + { + 0 + } else { + modification + .tline + .get_rel_size(rel, Version::Modified(modification), true, ctx) + .await? + }; + Ok(nblocks) +} + #[allow(clippy::bool_assert_comparison)] #[cfg(test)] mod tests { @@ -1632,10 +1644,7 @@ mod tests { static ZERO_CHECKPOINT: Bytes = Bytes::from_static(&[0u8; SIZEOF_CHECKPOINT]); - async fn init_walingest_test<'a>( - tline: &'a Timeline, - ctx: &RequestContext, - ) -> Result> { + async fn init_walingest_test(tline: &Timeline, ctx: &RequestContext) -> Result { let mut m = tline.begin_modification(Lsn(0x10)); m.put_checkpoint(ZERO_CHECKPOINT.clone())?; m.put_relmap_file(0, 111, Bytes::from(""), ctx).await?; // dummy relmapper file @@ -1680,29 +1689,29 @@ mod tests { // The relation was created at LSN 2, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, 1 ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, 3 ); @@ -1710,46 +1719,46 @@ mod tests { // Check page contents at each LSN assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x20), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 2") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x30), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x40), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x40), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); @@ -1765,19 +1774,19 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG("foo blk 0 at 3") ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG("foo blk 1 at 4") ); @@ -1785,13 +1794,13 @@ mod tests { // should still see the truncated block with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, 3 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 2, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG("foo blk 2 at 5") ); @@ -1804,7 +1813,7 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x68), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x68)), false, &ctx) .await?, 0 ); @@ -1817,19 +1826,19 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x70), 
false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, 2 ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 0, Lsn(0x70), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, ZERO_PAGE ); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1, Lsn(0x70), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx) .await?, TEST_IMG("foo blk 1") ); @@ -1842,21 +1851,21 @@ mod tests { m.commit(&ctx).await?; assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, 1501 ); for blk in 2..1500 { assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blk, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blk, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, ZERO_PAGE ); } assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, 1500, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, TEST_IMG("foo blk 1500") ); @@ -1883,13 +1892,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, 1 ); @@ -1902,7 +1911,7 @@ mod tests { // Check that rel is not visible anymore assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x30), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x30)), false, &ctx) .await?, false ); @@ -1920,13 +1929,13 @@ mod tests { // Check that rel exists and size is correct assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x40), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x40), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x40)), false, &ctx) .await?, 1 ); @@ -1959,24 +1968,24 @@ mod tests { // The relation was created at LSN 20, not visible at LSN 1 yet. 
assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await?, false ); assert!(tline - .get_rel_size(TESTREL_A, Lsn(0x10), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x10)), false, &ctx) .await .is_err()); assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x20), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x20)), false, &ctx) .await?, relsize ); @@ -1987,7 +1996,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, lsn, false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2004,7 +2013,7 @@ mod tests { // Check reported size and contents after truncation assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x60), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, 1 ); @@ -2014,7 +2023,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x60), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2023,7 +2032,7 @@ mod tests { // should still see all blocks with older LSN assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x50), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, relsize ); @@ -2032,7 +2041,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x50), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2052,13 +2061,13 @@ mod tests { assert_eq!( tline - .get_rel_exists(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_exists(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, true ); assert_eq!( tline - .get_rel_size(TESTREL_A, Lsn(0x80), false, &ctx) + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, relsize ); @@ -2068,7 +2077,7 @@ mod tests { let data = format!("foo blk {} at {}", blkno, lsn); assert_eq!( tline - .get_rel_page_at_lsn(TESTREL_A, blkno, Lsn(0x80), false, &ctx) + .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx) .await?, TEST_IMG(&data) ); @@ -2101,7 +2110,9 @@ mod tests { assert_current_logical_size(&tline, Lsn(lsn)); assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE + 1 ); @@ -2113,7 +2124,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -2126,7 +2139,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), false, &ctx) + .await?, RELSEG_SIZE - 1 ); assert_current_logical_size(&tline, Lsn(lsn)); @@ -2142,7 +2157,9 @@ mod tests { .await?; m.commit(&ctx).await?; assert_eq!( - tline.get_rel_size(TESTREL_A, Lsn(lsn), false, &ctx).await?, + tline + .get_rel_size(TESTREL_A, Version::Lsn(Lsn(lsn)), 
false, &ctx) + .await?, size as BlockNumber ); @@ -2179,7 +2196,7 @@ mod tests { let wal_segment_path = format!("{path}/000000010000000000000001.zst"); let source_initdb_path = format!("{path}/{INITDB_PATH}"); let startpoint = Lsn::from_hex("14AEC08").unwrap(); - let endpoint = Lsn::from_hex("1FFFF98").unwrap(); + let _endpoint = Lsn::from_hex("1FFFF98").unwrap(); let harness = TenantHarness::create("test_ingest_real_wal").unwrap(); let (tenant, ctx) = harness.load().await; @@ -2221,7 +2238,7 @@ mod tests { let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx) .await .unwrap(); - let mut modification = tline.begin_modification(endpoint); + let mut modification = tline.begin_modification(startpoint); let mut decoded = DecodedWALRecord::default(); println!("decoding {} bytes", bytes.len() - xlogoff); @@ -2235,6 +2252,7 @@ mod tests { .await .unwrap(); } + modification.commit(&ctx).await.unwrap(); } let duration = started_at.elapsed(); diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index 94e95fd3b3..6918698f29 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -22,6 +22,7 @@ use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::{BufMut, Bytes, BytesMut}; use nix::poll::*; +use pageserver_api::shard::TenantShardId; use serde::Serialize; use std::collections::VecDeque; use std::io; @@ -35,14 +36,11 @@ use std::sync::{Arc, Mutex, MutexGuard, RwLock}; use std::time::Duration; use std::time::Instant; use tracing::*; -use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock}; +use utils::{bin_ser::BeSer, lsn::Lsn, nonblock::set_nonblock}; #[cfg(feature = "testing")] use std::sync::atomic::{AtomicUsize, Ordering}; -#[cfg(feature = "testing")] -use pageserver_api::shard::TenantShardId; - use crate::config::PageServerConf; use crate::metrics::{ WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS, @@ -92,7 +90,7 @@ struct ProcessOutput { /// records. /// pub struct PostgresRedoManager { - tenant_id: TenantId, + tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, redo_process: RwLock>>, @@ -186,10 +184,13 @@ impl PostgresRedoManager { /// /// Create a new PostgresRedoManager. /// - pub fn new(conf: &'static PageServerConf, tenant_id: TenantId) -> PostgresRedoManager { + pub fn new( + conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, + ) -> PostgresRedoManager { // The actual process is launched lazily, on first request. PostgresRedoManager { - tenant_id, + tenant_shard_id, conf, last_redo_at: std::sync::Mutex::default(), redo_process: RwLock::new(None), @@ -244,8 +245,12 @@ impl PostgresRedoManager { let timer = WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer(); let proc = Arc::new( - WalRedoProcess::launch(self.conf, self.tenant_id, pg_version) - .context("launch walredo process")?, + WalRedoProcess::launch( + self.conf, + self.tenant_shard_id, + pg_version, + ) + .context("launch walredo process")?, ); timer.observe_duration(); *proc_guard = Some(Arc::clone(&proc)); @@ -638,7 +643,7 @@ impl CloseFileDescriptors for C { struct WalRedoProcess { #[allow(dead_code)] conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, // Some() on construction, only becomes None on Drop. child: Option, stdout: Mutex, @@ -652,10 +657,10 @@ impl WalRedoProcess { // // Start postgres binary in special WAL redo mode. 
// - #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] + #[instrument(skip_all,fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), pg_version=pg_version))] fn launch( conf: &'static PageServerConf, - tenant_id: TenantId, + tenant_shard_id: TenantShardId, pg_version: u32, ) -> anyhow::Result { let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible. @@ -680,7 +685,7 @@ impl WalRedoProcess { // as close-on-exec by default, but that's not enough, since we use // libraries that directly call libc open without setting that flag. .close_fds() - .spawn_no_leak_child(tenant_id) + .spawn_no_leak_child(tenant_shard_id) .context("spawn process")?; WAL_REDO_PROCESS_COUNTERS.started.inc(); let mut child = scopeguard::guard(child, |child| { @@ -741,12 +746,12 @@ impl WalRedoProcess { error!(error=?e, "failed to read from walredo stderr"); } } - }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version)) + }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version)) ); Ok(Self { conf, - tenant_id, + tenant_shard_id, child: Some(child), stdin: Mutex::new(ProcessInput { stdin, @@ -772,7 +777,7 @@ impl WalRedoProcess { // Apply given WAL records ('records') over an old page image. Returns // new page image. // - #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.id()))] + #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))] fn apply_wal_records( &self, tag: BufferTag, @@ -966,11 +971,7 @@ impl WalRedoProcess { // these files will be collected to an allure report let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len()); - // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId. - let path = self - .conf - .tenant_path(&TenantShardId::unsharded(self.tenant_id)) - .join(&filename); + let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename); let res = std::fs::OpenOptions::new() .write(true) @@ -1004,7 +1005,7 @@ impl Drop for WalRedoProcess { /// Wrapper type around `std::process::Child` which guarantees that the child /// will be killed and waited-for by this process before being dropped. struct NoLeakChild { - tenant_id: TenantId, + tenant_id: TenantShardId, child: Option, } @@ -1023,7 +1024,7 @@ impl DerefMut for NoLeakChild { } impl NoLeakChild { - fn spawn(tenant_id: TenantId, command: &mut Command) -> io::Result { + fn spawn(tenant_id: TenantShardId, command: &mut Command) -> io::Result { let child = command.spawn()?; Ok(NoLeakChild { tenant_id, @@ -1078,7 +1079,7 @@ impl Drop for NoLeakChild { Some(child) => child, None => return, }; - let tenant_id = self.tenant_id; + let tenant_shard_id = self.tenant_id; // Offload the kill+wait of the child process into the background. // If someone stops the runtime, we'll leak the child process. // We can ignore that case because we only stop the runtime on pageserver exit. @@ -1086,7 +1087,11 @@ impl Drop for NoLeakChild { tokio::task::spawn_blocking(move || { // Intentionally don't inherit the tracing context from whoever is dropping us. // This thread here is going to outlive of our dropper. 
- let span = tracing::info_span!("walredo", %tenant_id); + let span = tracing::info_span!( + "walredo", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug() + ); let _entered = span.enter(); Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop); }) @@ -1096,11 +1101,11 @@ impl Drop for NoLeakChild { } trait NoLeakChildCommandExt { - fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result; + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result; } impl NoLeakChildCommandExt for Command { - fn spawn_no_leak_child(&mut self, tenant_id: TenantId) -> io::Result { + fn spawn_no_leak_child(&mut self, tenant_id: TenantShardId) -> io::Result { NoLeakChild::spawn(tenant_id, self) } } @@ -1155,6 +1160,7 @@ mod tests { use crate::repository::Key; use crate::{config::PageServerConf, walrecord::NeonWalRecord}; use bytes::Bytes; + use pageserver_api::shard::TenantShardId; use std::str::FromStr; use utils::{id::TenantId, lsn::Lsn}; @@ -1264,9 +1270,9 @@ mod tests { let repo_dir = camino_tempfile::tempdir()?; let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf()); let conf = Box::leak(Box::new(conf)); - let tenant_id = TenantId::generate(); + let tenant_shard_id = TenantShardId::unsharded(TenantId::generate()); - let manager = PostgresRedoManager::new(conf, tenant_id); + let manager = PostgresRedoManager::new(conf, tenant_shard_id); Ok(RedoHarness { _repo_dir: repo_dir, diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 3b038f906f..3a7c0f1bb6 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -35,7 +35,8 @@ #define PageStoreTrace DEBUG5 -#define RECONNECT_INTERVAL_USEC 1000000 +#define MIN_RECONNECT_INTERVAL_USEC 1000 +#define MAX_RECONNECT_INTERVAL_USEC 1000000 bool connected = false; PGconn *pageserver_conn = NULL; @@ -133,6 +134,11 @@ pageserver_connect(int elevel) const char *values[3]; int n; + static TimestampTz last_connect_time = 0; + static uint64_t delay_us = MIN_RECONNECT_INTERVAL_USEC; + TimestampTz now; + uint64_t us_since_last_connect; + Assert(!connected); if (CheckConnstringUpdated()) @@ -140,6 +146,22 @@ pageserver_connect(int elevel) ReloadConnstring(); } + now = GetCurrentTimestamp(); + us_since_last_connect = now - last_connect_time; + if (us_since_last_connect < delay_us) + { + pg_usleep(delay_us - us_since_last_connect); + delay_us *= 2; + if (delay_us > MAX_RECONNECT_INTERVAL_USEC) + delay_us = MAX_RECONNECT_INTERVAL_USEC; + last_connect_time = GetCurrentTimestamp(); + } + else + { + delay_us = MIN_RECONNECT_INTERVAL_USEC; + last_connect_time = now; + } + /* * Connect using the connection string we got from the * neon.pageserver_connstring GUC. 
If the NEON_AUTH_TOKEN environment @@ -333,7 +355,6 @@ pageserver_send(NeonRequest *request) { HandleMainLoopInterrupts(); n_reconnect_attempts += 1; - pg_usleep(RECONNECT_INTERVAL_USEC); } n_reconnect_attempts = 0; } diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 7fb0cab9a0..1f7c473e7d 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -99,7 +99,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) port = strchr(host, ':'); if (port == NULL) { - walprop_log(FATAL, "port is not specified"); + wp_log(FATAL, "port is not specified"); } *port++ = '\0'; sep = strchr(port, ','); @@ -107,7 +107,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) *sep++ = '\0'; if (wp->n_safekeepers + 1 >= MAX_SAFEKEEPERS) { - walprop_log(FATAL, "Too many safekeepers"); + wp_log(FATAL, "too many safekeepers"); } wp->safekeeper[wp->n_safekeepers].host = host; wp->safekeeper[wp->n_safekeepers].port = port; @@ -123,7 +123,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) "host=%s port=%s dbname=replication options='-c timeline_id=%s tenant_id=%s'", sk->host, sk->port, wp->config->neon_timeline, wp->config->neon_tenant); if (written > MAXCONNINFO || written < 0) - walprop_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); + wp_log(FATAL, "could not create connection string for safekeeper %s:%s", sk->host, sk->port); } initStringInfo(&wp->safekeeper[wp->n_safekeepers].outbuf); @@ -133,7 +133,7 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) } if (wp->n_safekeepers < 1) { - walprop_log(FATAL, "Safekeepers addresses are not specified"); + wp_log(FATAL, "safekeepers addresses are not specified"); } wp->quorum = wp->n_safekeepers / 2 + 1; @@ -144,15 +144,15 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api) wp->api.strong_random(wp, &wp->greetRequest.proposerId, sizeof(wp->greetRequest.proposerId)); wp->greetRequest.systemId = wp->config->systemId; if (!wp->config->neon_timeline) - walprop_log(FATAL, "neon.timeline_id is not provided"); + wp_log(FATAL, "neon.timeline_id is not provided"); if (*wp->config->neon_timeline != '\0' && !HexDecodeString(wp->greetRequest.timeline_id, wp->config->neon_timeline, 16)) - walprop_log(FATAL, "Could not parse neon.timeline_id, %s", wp->config->neon_timeline); + wp_log(FATAL, "could not parse neon.timeline_id, %s", wp->config->neon_timeline); if (!wp->config->neon_tenant) - walprop_log(FATAL, "neon.tenant_id is not provided"); + wp_log(FATAL, "neon.tenant_id is not provided"); if (*wp->config->neon_tenant != '\0' && !HexDecodeString(wp->greetRequest.tenant_id, wp->config->neon_tenant, 16)) - walprop_log(FATAL, "Could not parse neon.tenant_id, %s", wp->config->neon_tenant); + wp_log(FATAL, "could not parse neon.tenant_id, %s", wp->config->neon_tenant); wp->greetRequest.timeline = wp->config->pgTimeline; wp->greetRequest.walSegSize = wp->config->wal_segment_size; @@ -274,8 +274,8 @@ WalProposerPoll(WalProposer *wp) if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now, wp->config->safekeeper_connection_timeout)) { - walprop_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that", - sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); + wp_log(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection 
attempt took longer than that", + sk->host, sk->port, FormatSafekeeperState(sk), wp->config->safekeeper_connection_timeout); ShutdownConnection(sk); } } @@ -356,8 +356,8 @@ ResetConnection(Safekeeper *sk) * * https://www.postgresql.org/docs/devel/libpq-connect.html#LIBPQ-PQCONNECTSTARTPARAMS */ - walprop_log(WARNING, "Immediate failure to connect with node '%s:%s':\n\terror: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "immediate failure to connect with node '%s:%s':\n\terror: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); /* * Even though the connection failed, we still need to clean up the @@ -380,7 +380,7 @@ ResetConnection(Safekeeper *sk) * (see libpqrcv_connect, defined in * src/backend/replication/libpqwalreceiver/libpqwalreceiver.c) */ - walprop_log(LOG, "connecting with node %s:%s", sk->host, sk->port); + wp_log(LOG, "connecting with node %s:%s", sk->host, sk->port); sk->state = SS_CONNECTING_WRITE; sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); @@ -434,7 +434,7 @@ ReconnectSafekeepers(WalProposer *wp) static void AdvancePollState(Safekeeper *sk, uint32 events) { -#ifdef WALPROPOSER_LIB /* walprop_log needs wp in lib build */ +#ifdef WALPROPOSER_LIB /* wp_log needs wp in lib build */ WalProposer *wp = sk->wp; #endif @@ -452,8 +452,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * ResetConnection */ case SS_OFFLINE: - walprop_log(FATAL, "Unexpected safekeeper %s:%s state advancement: is offline", - sk->host, sk->port); + wp_log(FATAL, "unexpected safekeeper %s:%s state advancement: is offline", + sk->host, sk->port); break; /* actually unreachable, but prevents * -Wimplicit-fallthrough */ @@ -488,8 +488,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * requests. */ case SS_VOTING: - walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -517,8 +517,8 @@ AdvancePollState(Safekeeper *sk, uint32 events) * Idle state for waiting votes from quorum. 
*/ case SS_IDLE: - walprop_log(WARNING, "EOF from node %s:%s in %s state", sk->host, - sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "EOF from node %s:%s in %s state", sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return; @@ -543,8 +543,8 @@ HandleConnectionEvent(Safekeeper *sk) switch (result) { case WP_CONN_POLLING_OK: - walprop_log(LOG, "connected with node %s:%s", sk->host, - sk->port); + wp_log(LOG, "connected with node %s:%s", sk->host, + sk->port); sk->latestMsgReceivedAt = wp->api.get_current_timestamp(wp); /* @@ -567,8 +567,8 @@ HandleConnectionEvent(Safekeeper *sk) break; case WP_CONN_POLLING_FAILED: - walprop_log(WARNING, "failed to connect to node '%s:%s': %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to connect to node '%s:%s': %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); /* * If connecting failed, we don't want to restart the connection @@ -604,8 +604,8 @@ SendStartWALPush(Safekeeper *sk) if (!wp->api.conn_send_query(sk, "START_WAL_PUSH")) { - walprop_log(WARNING, "Failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send 'START_WAL_PUSH' query to safekeeper %s:%s: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; } @@ -641,8 +641,8 @@ RecvStartWALPushResult(Safekeeper *sk) break; case WP_EXEC_FAILED: - walprop_log(WARNING, "Failed to send query to safekeeper %s:%s: %s", - sk->host, sk->port, wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send query to safekeeper %s:%s: %s", + sk->host, sk->port, wp->api.conn_error_message(sk)); ShutdownConnection(sk); return; @@ -652,8 +652,8 @@ RecvStartWALPushResult(Safekeeper *sk) * wrong" */ case WP_EXEC_UNEXPECTED_SUCCESS: - walprop_log(WARNING, "Received bad response from safekeeper %s:%s query execution", - sk->host, sk->port); + wp_log(WARNING, "received bad response from safekeeper %s:%s query execution", + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -688,7 +688,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->greetResponse)) return; - walprop_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); + wp_log(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port); /* Protocol is all good, move to voting. */ sk->state = SS_VOTING; @@ -708,7 +708,7 @@ RecvAcceptorGreeting(Safekeeper *sk) if (wp->n_connected == wp->quorum) { wp->propTerm++; - walprop_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); + wp_log(LOG, "proposer connected to quorum (%d) safekeepers, propTerm=" INT64_FORMAT, wp->quorum, wp->propTerm); wp->voteRequest = (VoteRequest) { @@ -721,9 +721,9 @@ RecvAcceptorGreeting(Safekeeper *sk) else if (sk->greetResponse.term > wp->propTerm) { /* Another compute with higher term is running. 
*/ - walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->greetResponse.term, wp->propTerm); + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->greetResponse.term, wp->propTerm); } /* @@ -763,7 +763,7 @@ SendVoteRequest(Safekeeper *sk) WalProposer *wp = sk->wp; /* We have quorum for voting, send our vote request */ - walprop_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); + wp_log(LOG, "requesting vote from %s:%s for term " UINT64_FORMAT, sk->host, sk->port, wp->voteRequest.term); /* On failure, logging & resetting is handled */ if (!BlockingWrite(sk, &wp->voteRequest, sizeof(wp->voteRequest), SS_WAIT_VERDICT)) return; @@ -780,12 +780,12 @@ RecvVoteResponse(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->voteResponse)) return; - walprop_log(LOG, - "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", - sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), - LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), - LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), - LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); + wp_log(LOG, + "got VoteResponse from acceptor %s:%s, voteGiven=" UINT64_FORMAT ", epoch=" UINT64_FORMAT ", flushLsn=%X/%X, truncateLsn=%X/%X, timelineStartLsn=%X/%X", + sk->host, sk->port, sk->voteResponse.voteGiven, GetHighestTerm(&sk->voteResponse.termHistory), + LSN_FORMAT_ARGS(sk->voteResponse.flushLsn), + LSN_FORMAT_ARGS(sk->voteResponse.truncateLsn), + LSN_FORMAT_ARGS(sk->voteResponse.timelineStartLsn)); /* * In case of acceptor rejecting our vote, bail out, but only if either it @@ -795,9 +795,9 @@ RecvVoteResponse(Safekeeper *sk) if ((!sk->voteResponse.voteGiven) && (sk->voteResponse.term > wp->propTerm || wp->n_votes < wp->quorum)) { - walprop_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", - sk->host, sk->port, - sk->voteResponse.term, wp->propTerm); + wp_log(FATAL, "WAL acceptor %s:%s with term " INT64_FORMAT " rejects our connection request with term " INT64_FORMAT "", + sk->host, sk->port, + sk->voteResponse.term, wp->propTerm); } Assert(sk->voteResponse.term == wp->propTerm); @@ -841,7 +841,7 @@ HandleElectedProposer(WalProposer *wp) */ if (!wp->api.recovery_download(wp, &wp->safekeeper[wp->donor])) { - walprop_log(FATAL, "failed to download WAL for logical replicaiton"); + wp_log(FATAL, "failed to download WAL for logical replicaiton"); } if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) @@ -948,10 +948,10 @@ DetermineEpochStartLsn(WalProposer *wp) if (wp->timelineStartLsn != InvalidXLogRecPtr && wp->timelineStartLsn != wp->safekeeper[i].voteResponse.timelineStartLsn) { - walprop_log(WARNING, - "inconsistent timelineStartLsn: current %X/%X, received %X/%X", - LSN_FORMAT_ARGS(wp->timelineStartLsn), - LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); + wp_log(WARNING, + "inconsistent timelineStartLsn: current %X/%X, received %X/%X", + LSN_FORMAT_ARGS(wp->timelineStartLsn), + LSN_FORMAT_ARGS(wp->safekeeper[i].voteResponse.timelineStartLsn)); } wp->timelineStartLsn = wp->safekeeper[i].voteResponse.timelineStartLsn; } @@ -969,7 +969,7 @@ 
DetermineEpochStartLsn(WalProposer *wp) { wp->timelineStartLsn = wp->api.get_redo_start_lsn(wp); } - walprop_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); + wp_log(LOG, "bumped epochStartLsn to the first record %X/%X", LSN_FORMAT_ARGS(wp->propEpochStartLsn)); } /* @@ -996,12 +996,12 @@ DetermineEpochStartLsn(WalProposer *wp) wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].term = wp->propTerm; wp->propTermHistory.entries[wp->propTermHistory.n_entries - 1].lsn = wp->propEpochStartLsn; - walprop_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", - wp->quorum, - wp->propTerm, - LSN_FORMAT_ARGS(wp->propEpochStartLsn), - wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, - LSN_FORMAT_ARGS(wp->truncateLsn)); + wp_log(LOG, "got votes from majority (%d) of nodes, term " UINT64_FORMAT ", epochStartLsn %X/%X, donor %s:%s, truncate_lsn %X/%X", + wp->quorum, + wp->propTerm, + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + wp->safekeeper[wp->donor].host, wp->safekeeper[wp->donor].port, + LSN_FORMAT_ARGS(wp->truncateLsn)); /* * Ensure the basebackup we are running (at RedoStartLsn) matches LSN @@ -1034,10 +1034,10 @@ DetermineEpochStartLsn(WalProposer *wp) * scenario. */ disable_core_dump(); - walprop_log(PANIC, - "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", - LSN_FORMAT_ARGS(wp->propEpochStartLsn), - LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); + wp_log(PANIC, + "collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X", + LSN_FORMAT_ARGS(wp->propEpochStartLsn), + LSN_FORMAT_ARGS(wp->api.get_redo_start_lsn(wp))); } } walprop_shared->mineLastElectedTerm = wp->propTerm; @@ -1091,34 +1091,10 @@ SendProposerElected(Safekeeper *sk) { /* safekeeper is empty or no common point, start from the beginning */ sk->startStreamingAt = wp->propTermHistory.entries[0].lsn; - - if (sk->startStreamingAt < wp->truncateLsn) - { - /* - * There's a gap between the WAL starting point and a truncateLsn, - * which can't appear in a normal working cluster. That gap means - * that all safekeepers reported that they have persisted WAL up - * to the truncateLsn before, but now current safekeeper tells - * otherwise. - * - * Also we have a special condition here, which is empty - * safekeeper with no history. In combination with a gap, that can - * happen when we introduce a new safekeeper to the cluster. This - * is a rare case, which is triggered manually for now, and should - * be treated with care. - */ - - /* - * truncateLsn will not change without ack from current - * safekeeper, and it's aligned to the WAL record, so we can - * safely start streaming from this point. 
- */ - sk->startStreamingAt = wp->truncateLsn; - - walprop_log(WARNING, "empty safekeeper joined cluster as %s:%s, historyStart=%X/%X, sk->startStreamingAt=%X/%X", - sk->host, sk->port, LSN_FORMAT_ARGS(wp->propTermHistory.entries[0].lsn), - LSN_FORMAT_ARGS(sk->startStreamingAt)); - } + wp_log(LOG, "no common point with sk %s:%s, streaming since first term at %X/%X, timelineStartLsn=%X/%X, termHistory.n_entries=%u" , + sk->host, sk->port, LSN_FORMAT_ARGS(sk->startStreamingAt), LSN_FORMAT_ARGS(wp->timelineStartLsn), wp->propTermHistory.n_entries); + /* wp->timelineStartLsn == InvalidXLogRecPtr can be only when timeline is created manually (test_s3_wal_replay) */ + Assert(sk->startStreamingAt == wp->timelineStartLsn || wp->timelineStartLsn == InvalidXLogRecPtr); } else { @@ -1141,7 +1117,7 @@ SendProposerElected(Safekeeper *sk) } } - Assert(sk->startStreamingAt >= wp->truncateLsn && sk->startStreamingAt <= wp->availableLsn); + Assert(sk->startStreamingAt <= wp->availableLsn); msg.tag = 'e'; msg.term = wp->propTerm; @@ -1150,9 +1126,9 @@ SendProposerElected(Safekeeper *sk) msg.timelineStartLsn = wp->timelineStartLsn; lastCommonTerm = i >= 0 ? wp->propTermHistory.entries[i].term : 0; - walprop_log(LOG, - "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", - sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); + wp_log(LOG, + "sending elected msg to node " UINT64_FORMAT " term=" UINT64_FORMAT ", startStreamingAt=%X/%X (lastCommonTerm=" UINT64_FORMAT "), termHistory.n_entries=%u to %s:%s, timelineStartLsn=%X/%X", + sk->greetResponse.nodeId, msg.term, LSN_FORMAT_ARGS(msg.startStreamingAt), lastCommonTerm, msg.termHistory->n_entries, sk->host, sk->port, LSN_FORMAT_ARGS(msg.timelineStartLsn)); resetStringInfo(&sk->outbuf); pq_sendint64_le(&sk->outbuf, msg.tag); @@ -1261,8 +1237,8 @@ HandleActiveState(Safekeeper *sk, uint32 events) /* expected never to happen, c.f. 
walprop_pg_active_state_update_event_set */ if (events & WL_SOCKET_CLOSED) { - walprop_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", - sk->host, sk->port); + wp_log(WARNING, "connection to %s:%s in active state failed, got WL_SOCKET_CLOSED on neon_walreader socket", + sk->host, sk->port); ShutdownConnection(sk); return; } @@ -1323,12 +1299,12 @@ SendAppendRequests(Safekeeper *sk) req = &sk->appendRequest; PrepareAppendRequest(sk->wp, &sk->appendRequest, sk->streamingAt, endLsn); - walprop_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", - req->endLsn - req->beginLsn, - LSN_FORMAT_ARGS(req->beginLsn), - LSN_FORMAT_ARGS(req->endLsn), - LSN_FORMAT_ARGS(req->commitLsn), - LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); + wp_log(DEBUG5, "sending message len %ld beginLsn=%X/%X endLsn=%X/%X commitLsn=%X/%X truncateLsn=%X/%X to %s:%s", + req->endLsn - req->beginLsn, + LSN_FORMAT_ARGS(req->beginLsn), + LSN_FORMAT_ARGS(req->endLsn), + LSN_FORMAT_ARGS(req->commitLsn), + LSN_FORMAT_ARGS(wp->truncateLsn), sk->host, sk->port); resetStringInfo(&sk->outbuf); @@ -1355,8 +1331,8 @@ SendAppendRequests(Safekeeper *sk) case NEON_WALREAD_WOULDBLOCK: return true; case NEON_WALREAD_ERROR: - walprop_log(WARNING, "WAL reading for node %s:%s failed: %s", - sk->host, sk->port, errmsg); + wp_log(WARNING, "WAL reading for node %s:%s failed: %s", + sk->host, sk->port, errmsg); ShutdownConnection(sk); return false; default: @@ -1388,9 +1364,9 @@ SendAppendRequests(Safekeeper *sk) return true; case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; default: @@ -1429,11 +1405,11 @@ RecvAppendResponses(Safekeeper *sk) if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) &sk->appendResponse)) break; - walprop_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", - sk->appendResponse.term, - LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), - LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), - sk->host, sk->port); + wp_log(DEBUG2, "received message term=" INT64_FORMAT " flushLsn=%X/%X commitLsn=%X/%X from %s:%s", + sk->appendResponse.term, + LSN_FORMAT_ARGS(sk->appendResponse.flushLsn), + LSN_FORMAT_ARGS(sk->appendResponse.commitLsn), + sk->host, sk->port); if (sk->appendResponse.term > wp->propTerm) { @@ -1443,9 +1419,9 @@ RecvAppendResponses(Safekeeper *sk) * core as this is kinda expected scenario. 
*/ disable_core_dump(); - walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", - sk->host, sk->port, - sk->appendResponse.term, wp->propTerm); + wp_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "", + sk->host, sk->port, + sk->appendResponse.term, wp->propTerm); } readAnything = true; @@ -1489,32 +1465,32 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->currentClusterSize = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", - rf->currentClusterSize); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: current_timeline_size %lu", + rf->currentClusterSize); } else if ((strcmp(key, "ps_writelsn") == 0) || (strcmp(key, "last_received_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->last_received_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", - LSN_FORMAT_ARGS(rf->last_received_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: last_received_lsn %X/%X", + LSN_FORMAT_ARGS(rf->last_received_lsn)); } else if ((strcmp(key, "ps_flushlsn") == 0) || (strcmp(key, "disk_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->disk_consistent_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: disk_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->disk_consistent_lsn)); } else if ((strcmp(key, "ps_applylsn") == 0) || (strcmp(key, "remote_consistent_lsn") == 0)) { pq_getmsgint(reply_message, sizeof(int32)); /* read value length */ rf->remote_consistent_lsn = pq_getmsgint64(reply_message); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", - LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: remote_consistent_lsn %X/%X", + LSN_FORMAT_ARGS(rf->remote_consistent_lsn)); } else if ((strcmp(key, "ps_replytime") == 0) || (strcmp(key, "replytime") == 0)) { @@ -1526,8 +1502,8 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese /* Copy because timestamptz_to_str returns a static buffer */ replyTimeStr = pstrdup(timestamptz_to_str(rf->replytime)); - walprop_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", - rf->replytime, replyTimeStr); + wp_log(DEBUG2, "ParsePageserverFeedbackMessage: replytime %lu reply_time: %s", + rf->replytime, replyTimeStr); pfree(replyTimeStr); } @@ -1541,7 +1517,7 @@ ParsePageserverFeedbackMessage(WalProposer *wp, StringInfo reply_message, Pagese * Skip unknown keys to support backward compatibile protocol * changes */ - walprop_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); + wp_log(LOG, "ParsePageserverFeedbackMessage: unknown key: %s len %d", key, len); pq_getmsgbytes(reply_message, len); }; } @@ -1606,7 +1582,7 @@ GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn) if (wp->n_votes < wp->quorum) { - walprop_log(WARNING, "GetDonor called before elections are won"); + wp_log(WARNING, "GetDonor called before elections are won"); return NULL; } @@ -1734,9 +1710,9 @@ AsyncRead(Safekeeper *sk, char **buf, int *buf_size) return false; case 
PG_ASYNC_READ_FAIL: - walprop_log(WARNING, "Failed to read from node %s:%s in %s state: %s", sk->host, - sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to read from node %s:%s in %s state: %s", sk->host, + sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } @@ -1774,8 +1750,8 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg) tag = pq_getmsgint64_le(&s); if (tag != anymsg->tag) { - walprop_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, - sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "unexpected message tag %c from node %s:%s in state %s", (char) tag, sk->host, + sk->port, FormatSafekeeperState(sk)); ResetConnection(sk); return false; } @@ -1851,9 +1827,9 @@ BlockingWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState succes if (!wp->api.conn_blocking_write(sk, msg, msg_size)) { - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; } @@ -1904,9 +1880,9 @@ AsyncWrite(Safekeeper *sk, void *msg, size_t msg_size, SafekeeperState flush_sta wp->api.update_event_set(sk, WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE); return false; case PG_ASYNC_WRITE_FAIL: - walprop_log(WARNING, "Failed to send to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to send to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ShutdownConnection(sk); return false; default: @@ -1943,9 +1919,9 @@ AsyncFlush(Safekeeper *sk) /* Nothing to do; try again when the socket's ready */ return false; case -1: - walprop_log(WARNING, "Failed to flush write to node %s:%s in %s state: %s", - sk->host, sk->port, FormatSafekeeperState(sk), - wp->api.conn_error_message(sk)); + wp_log(WARNING, "failed to flush write to node %s:%s in %s state: %s", + sk->host, sk->port, FormatSafekeeperState(sk), + wp->api.conn_error_message(sk)); ResetConnection(sk); return false; default: @@ -1974,11 +1950,11 @@ CompareLsn(const void *a, const void *b) * * The strings are intended to be used as a prefix to "state", e.g.: * - * walprop_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); + * wp_log(LOG, "currently in %s state", FormatSafekeeperState(sk)); * * If this sort of phrasing doesn't fit the message, instead use something like: * - * walprop_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); + * wp_log(LOG, "currently in state [%s]", FormatSafekeeperState(sk)); */ static char * FormatSafekeeperState(Safekeeper *sk) @@ -2059,8 +2035,8 @@ AssertEventsOkForState(uint32 events, Safekeeper *sk) * To give a descriptive message in the case of failure, we use elog * and then an assertion that's guaranteed to fail. 
*/ - walprop_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", - FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); + wp_log(WARNING, "events %s mismatched for safekeeper %s:%s in state [%s]", + FormatEvents(wp, events), sk->host, sk->port, FormatSafekeeperState(sk)); Assert(events_ok_for_state); } } @@ -2199,8 +2175,8 @@ FormatEvents(WalProposer *wp, uint32 events) if (events & (~all_flags)) { - walprop_log(WARNING, "Event formatting found unexpected component %d", - events & (~all_flags)); + wp_log(WARNING, "event formatting found unexpected component %d", + events & (~all_flags)); return_str[6] = '*'; return_str[7] = '\0'; } diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index 6d478076fe..688d8e6e52 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -707,11 +707,23 @@ extern Safekeeper *GetDonor(WalProposer *wp, XLogRecPtr *donor_lsn); #define WPEVENT 1337 /* special log level for walproposer internal * events */ +#define WP_LOG_PREFIX "[WP] " + +/* + * wp_log is used in pure wp code (walproposer.c), allowing API callback to + * catch logging. + */ #ifdef WALPROPOSER_LIB extern void WalProposerLibLog(WalProposer *wp, int elevel, char *fmt,...); -#define walprop_log(elevel, ...) WalProposerLibLog(wp, elevel, __VA_ARGS__) +#define wp_log(elevel, fmt, ...) WalProposerLibLog(wp, elevel, fmt, ## __VA_ARGS__) #else -#define walprop_log(elevel, ...) elog(elevel, __VA_ARGS__) +#define wp_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) #endif +/* + * And wpg_log is used all other (postgres specific) walproposer code, just + * adding prefix. + */ +#define wpg_log(elevel, fmt, ...) elog(elevel, WP_LOG_PREFIX fmt, ## __VA_ARGS__) + #endif /* __NEON_WALPROPOSER_H__ */ diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 7773aabfab..61a2a54809 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -424,8 +424,8 @@ walprop_pg_start_streaming(WalProposer *wp, XLogRecPtr startpos) { StartReplicationCmd cmd; - elog(LOG, "WAL proposer starts streaming at %X/%X", - LSN_FORMAT_ARGS(startpos)); + wpg_log(LOG, "WAL proposer starts streaming at %X/%X", + LSN_FORMAT_ARGS(startpos)); cmd.slotname = WAL_PROPOSER_SLOT_NAME; cmd.timeline = wp->greetRequest.timeline; cmd.startpoint = startpos; @@ -549,7 +549,7 @@ walprop_pg_load_libpqwalreceiver(void) { load_file("libpqwalreceiver", false); if (WalReceiverFunctions == NULL) - elog(ERROR, "libpqwalreceiver didn't initialize correctly"); + wpg_log(ERROR, "libpqwalreceiver didn't initialize correctly"); } /* Helper function */ @@ -630,7 +630,7 @@ libpqwp_connect_start(char *conninfo) * PGconn structure" */ if (!pg_conn) - elog(FATAL, "failed to allocate new PGconn object"); + wpg_log(FATAL, "failed to allocate new PGconn object"); /* * And in theory this allocation can fail as well, but it's incredibly @@ -680,7 +680,7 @@ walprop_connect_poll(Safekeeper *sk) * unused. We'll expect it's never returned. 
*/ case PGRES_POLLING_ACTIVE: - elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); + wpg_log(FATAL, "unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll"); /* * This return is never actually reached, but it's here to make @@ -745,7 +745,7 @@ libpqwp_get_query_result(WalProposerConn *conn) */ if (!result) { - elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); + wpg_log(WARNING, "[libpqwalproposer] Unexpected successful end of command results"); return WP_EXEC_UNEXPECTED_SUCCESS; } @@ -793,7 +793,7 @@ libpqwp_get_query_result(WalProposerConn *conn) } if (unexpected_success) - elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); + wpg_log(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success); return return_val; } @@ -872,7 +872,7 @@ libpqwp_async_read(WalProposerConn *conn, char **buf, int *amount) ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn)); if (status != PGRES_FATAL_ERROR) - elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status); + wpg_log(FATAL, "unexpected result status %d after failed PQgetCopyData", status); /* * If there was actually an error, it'll be properly reported @@ -937,7 +937,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size) case -1: return PG_ASYNC_WRITE_FAIL; default: - elog(FATAL, "invalid return %d from PQputCopyData", result); + wpg_log(FATAL, "invalid return %d from PQputCopyData", result); } /* @@ -958,7 +958,7 @@ walprop_async_write(Safekeeper *sk, void const *buf, size_t size) case -1: return PG_ASYNC_WRITE_FAIL; default: - elog(FATAL, "invalid return %d from PQflush", result); + wpg_log(FATAL, "invalid return %d from PQflush", result); } } @@ -1237,19 +1237,6 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) return true; /* recovery not needed */ endpos = wp->propEpochStartLsn; - /* - * If we need to download more than a max_slot_wal_keep_size, cap to it to - * avoid risk of exploding pg_wal. Logical replication won't work until - * recreated, but at least compute would start; this also follows - * max_slot_wal_keep_size semantics. 
- */ - download_range_mb = (endpos - startpos) / 1024 / 1024; - if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) - { - startpos = endpos - max_slot_wal_keep_size_mb * 1024 * 1024; - walprop_log(WARNING, "capped WAL download for logical replication to %X/%X as max_slot_wal_keep_size=%dMB", - LSN_FORMAT_ARGS(startpos), max_slot_wal_keep_size_mb); - } timeline = wp->greetRequest.timeline; if (!neon_auth_token) @@ -1262,7 +1249,7 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) written = snprintf((char *) conninfo, MAXCONNINFO, "password=%s %s", neon_auth_token, sk->conninfo); if (written > MAXCONNINFO || written < 0) - elog(FATAL, "could not append password to the safekeeper connection string"); + wpg_log(FATAL, "could not append password to the safekeeper connection string"); } #if PG_MAJORVERSION_NUM < 16 @@ -1279,11 +1266,11 @@ WalProposerRecovery(WalProposer *wp, Safekeeper *sk) err))); return false; } - elog(LOG, - "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " - "%d", - sk->host, sk->port, (uint32) (startpos >> 32), - (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); + wpg_log(LOG, + "start recovery for logical replication from %s:%s starting from %X/%08X till %X/%08X timeline " + "%d", + sk->host, sk->port, (uint32) (startpos >> 32), + (uint32) startpos, (uint32) (endpos >> 32), (uint32) endpos, timeline); options.logical = false; options.startpoint = startpos; @@ -1481,11 +1468,11 @@ walprop_pg_wal_reader_allocate(Safekeeper *sk) { char log_prefix[64]; - snprintf(log_prefix, sizeof(log_prefix), "sk %s:%s nwr: ", sk->host, sk->port); + snprintf(log_prefix, sizeof(log_prefix), WP_LOG_PREFIX "sk %s:%s nwr: ", sk->host, sk->port); Assert(!sk->xlogreader); sk->xlogreader = NeonWALReaderAllocate(wal_segment_size, sk->wp->propEpochStartLsn, sk->wp, log_prefix); if (sk->xlogreader == NULL) - elog(FATAL, "Failed to allocate xlog reader"); + wpg_log(FATAL, "failed to allocate xlog reader"); } static NeonWALReadResult @@ -1549,7 +1536,7 @@ static void walprop_pg_init_event_set(WalProposer *wp) { if (waitEvents) - elog(FATAL, "double-initialization of event set"); + wpg_log(FATAL, "double-initialization of event set"); /* for each sk, we have socket plus potentially socket for neon walreader */ waitEvents = CreateWaitEventSet(TopMemoryContext, 2 + 2 * wp->n_safekeepers); @@ -1581,7 +1568,7 @@ add_nwr_event_set(Safekeeper *sk, uint32 events) Assert(sk->nwrEventPos == -1); sk->nwrEventPos = AddWaitEventToSet(waitEvents, events, NeonWALReaderSocket(sk->xlogreader), NULL, sk); sk->nwrConnEstablished = NeonWALReaderIsRemConnEstablished(sk->xlogreader); - elog(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); + wpg_log(DEBUG5, "sk %s:%s: added nwr socket events %d", sk->host, sk->port, events); } static void @@ -1680,8 +1667,8 @@ rm_safekeeper_event_set(Safekeeper *to_remove, bool is_sk) { WalProposer *wp = to_remove->wp; - elog(DEBUG5, "sk %s:%s: removing event, is_sk %d", - to_remove->host, to_remove->port, is_sk); + wpg_log(DEBUG5, "sk %s:%s: removing event, is_sk %d", + to_remove->host, to_remove->port, is_sk); /* * Shortpath for exiting if have nothing to do. 
We never call this @@ -1835,13 +1822,13 @@ GetLatestNeonFeedback(PageserverFeedback *rf, WalProposer *wp) rf->remote_consistent_lsn = wp->safekeeper[latest_safekeeper].appendResponse.rf.remote_consistent_lsn; rf->replytime = wp->safekeeper[latest_safekeeper].appendResponse.rf.replytime; - elog(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," - " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", - rf->currentClusterSize, - LSN_FORMAT_ARGS(rf->last_received_lsn), - LSN_FORMAT_ARGS(rf->disk_consistent_lsn), - LSN_FORMAT_ARGS(rf->remote_consistent_lsn), - rf->replytime); + wpg_log(DEBUG2, "GetLatestNeonFeedback: currentClusterSize %lu," + " last_received_lsn %X/%X, disk_consistent_lsn %X/%X, remote_consistent_lsn %X/%X, replytime %lu", + rf->currentClusterSize, + LSN_FORMAT_ARGS(rf->last_received_lsn), + LSN_FORMAT_ARGS(rf->disk_consistent_lsn), + LSN_FORMAT_ARGS(rf->remote_consistent_lsn), + rf->replytime); } /* @@ -1987,7 +1974,7 @@ GetLogRepRestartLSN(WalProposer *wp) { uint64 download_range_mb; - elog(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); + wpg_log(LOG, "logical replication restart LSN %X/%X", LSN_FORMAT_ARGS(lrRestartLsn)); /* * If we need to download more than a max_slot_wal_keep_size, @@ -1999,8 +1986,8 @@ GetLogRepRestartLSN(WalProposer *wp) download_range_mb = (wp->propEpochStartLsn - lrRestartLsn) / MB; if (max_slot_wal_keep_size_mb > 0 && download_range_mb >= max_slot_wal_keep_size_mb) { - walprop_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", - LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); + wpg_log(WARNING, "not downloading WAL for logical replication since %X/%X as max_slot_wal_keep_size=%dMB", + LSN_FORMAT_ARGS(lrRestartLsn), max_slot_wal_keep_size_mb); return InvalidXLogRecPtr; } diff --git a/poetry.lock b/poetry.lock index 76dfd6d37d..428698cb5a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. [[package]] name = "aiohttp" @@ -288,70 +288,21 @@ files = [ {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, ] -[[package]] -name = "black" -version = "23.3.0" -description = "The uncompromising code formatter." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, - {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, - {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, - {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, - {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, - {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, - {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, - {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, - {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, - {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, - {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, - {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, - {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, - {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, - {file = "black-23.3.0-py3-none-any.whl", hash = 
"sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, - {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, -] - -[package.dependencies] -click = ">=8.0.0" -mypy-extensions = ">=0.4.3" -packaging = ">=22.0" -pathspec = ">=0.9.0" -platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} - -[package.extras] -colorama = ["colorama (>=0.4.3)"] -d = ["aiohttp (>=3.7.4)"] -jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] -uvloop = ["uvloop (>=0.15.2)"] - [[package]] name = "boto3" -version = "1.26.16" +version = "1.34.11" description = "The AWS SDK for Python" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "boto3-1.26.16-py3-none-any.whl", hash = "sha256:4f493a2aed71cee93e626de4f67ce58dd82c0473480a0fc45b131715cd8f4f30"}, - {file = "boto3-1.26.16.tar.gz", hash = "sha256:31c0adf71e4bd19a5428580bb229d7ea3b5795eecaa0847a85385df00c026116"}, + {file = "boto3-1.34.11-py3-none-any.whl", hash = "sha256:1af021e0c6e3040e8de66d403e963566476235bb70f9a8e3f6784813ac2d8026"}, + {file = "boto3-1.34.11.tar.gz", hash = "sha256:31c130a40ec0631059b77d7e87f67ad03ff1685a5b37638ac0c4687026a3259d"}, ] [package.dependencies] -botocore = ">=1.29.16,<1.30.0" +botocore = ">=1.34.11,<1.35.0" jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.6.0,<0.7.0" +s3transfer = ">=0.10.0,<0.11.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] @@ -702,22 +653,25 @@ xray = ["mypy-boto3-xray (>=1.26.0,<1.27.0)"] [[package]] name = "botocore" -version = "1.29.16" +version = "1.34.11" description = "Low-level, data-driven core of boto 3." optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "botocore-1.29.16-py3-none-any.whl", hash = "sha256:271b599e6cfe214405ed50d41cd967add1d5d469383dd81ff583bc818b47f59b"}, - {file = "botocore-1.29.16.tar.gz", hash = "sha256:8cfcc10f2f1751608c3cec694f2d6b5e16ebcd50d0a104f9914d5616227c62e9"}, + {file = "botocore-1.34.11-py3-none-any.whl", hash = "sha256:1ff1398b6ea670e1c01ac67a33af3da854f8e700d3528289c04f319c330d8250"}, + {file = "botocore-1.34.11.tar.gz", hash = "sha256:51905c3d623c60df5dc5794387de7caf886d350180a01a3dfa762e903edb45a9"}, ] [package.dependencies] jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" -urllib3 = ">=1.25.4,<1.27" +urllib3 = [ + {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, + {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, +] [package.extras] -crt = ["awscrt (==0.14.0)"] +crt = ["awscrt (==0.19.19)"] [[package]] name = "botocore-stubs" @@ -1624,17 +1578,6 @@ files = [ {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, ] -[[package]] -name = "pathspec" -version = "0.9.0" -description = "Utility library for gitignore style pattern matching of file paths." 
-optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" -files = [ - {file = "pathspec-0.9.0-py2.py3-none-any.whl", hash = "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a"}, - {file = "pathspec-0.9.0.tar.gz", hash = "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1"}, -] - [[package]] name = "pbr" version = "5.9.0" @@ -1646,21 +1589,6 @@ files = [ {file = "pbr-5.9.0.tar.gz", hash = "sha256:e8dca2f4b43560edef58813969f52a56cef023146cbb8931626db80e6c1c4308"}, ] -[[package]] -name = "platformdirs" -version = "2.5.2" -description = "A small Python module for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -optional = false -python-versions = ">=3.7" -files = [ - {file = "platformdirs-2.5.2-py3-none-any.whl", hash = "sha256:027d8e83a2d7de06bbac4e5ef7e023c02b863d7ea5d079477e722bb41ab25788"}, - {file = "platformdirs-2.5.2.tar.gz", hash = "sha256:58c8abb07dcb441e6ee4b11d8df0ac856038f944ab98b7be6b27b2a3c7feef19"}, -] - -[package.extras] -docs = ["furo (>=2021.7.5b38)", "proselint (>=0.10.2)", "sphinx (>=4)", "sphinx-autodoc-typehints (>=1.12)"] -test = ["appdirs (==1.4.4)", "pytest (>=6)", "pytest-cov (>=2.7)", "pytest-mock (>=3.6)"] - [[package]] name = "pluggy" version = "1.0.0" @@ -1889,13 +1817,13 @@ files = [ [[package]] name = "pytest" -version = "7.3.1" +version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" files = [ - {file = "pytest-7.3.1-py3-none-any.whl", hash = "sha256:3799fa815351fea3a5e96ac7e503a96fa51cc9942c3753cda7651b93c1cfa362"}, - {file = "pytest-7.3.1.tar.gz", hash = "sha256:434afafd78b1d78ed0addf160ad2b77a30d35d4bdf8af234fe621919d9ed15e3"}, + {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, + {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, ] [package.dependencies] @@ -1907,7 +1835,7 @@ pluggy = ">=0.12,<2.0" tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] [[package]] name = "pytest-asyncio" @@ -2204,46 +2132,46 @@ pyasn1 = ">=0.1.3" [[package]] name = "ruff" -version = "0.0.269" -description = "An extremely fast Python linter, written in Rust." +version = "0.1.11" +description = "An extremely fast Python linter and code formatter, written in Rust." 
optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.0.269-py3-none-macosx_10_7_x86_64.whl", hash = "sha256:3569bcdee679045c09c0161fabc057599759c49219a08d9a4aad2cc3982ccba3"}, - {file = "ruff-0.0.269-py3-none-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:56347da63757a56cbce7d4b3d6044ca4f1941cd1bbff3714f7554360c3361f83"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6da8ee25ef2f0cc6cc8e6e20942c1d44d25a36dce35070d7184655bc14f63f63"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd81b8e681b9eaa6cf15484f3985bd8bd97c3d114e95bff3e8ea283bf8865062"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f19f59ca3c28742955241fb452f3346241ddbd34e72ac5cb3d84fadebcf6bc8"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:f062059b8289a4fab7f6064601b811d447c2f9d3d432a17f689efe4d68988450"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3f5dc7aac52c58e82510217e3c7efd80765c134c097c2815d59e40face0d1fe6"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e131b4dbe798c391090c6407641d6ab12c0fa1bb952379dde45e5000e208dabb"}, - {file = "ruff-0.0.269-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a374434e588e06550df0f8dcb74777290f285678de991fda4e1063c367ab2eb2"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:cec2f4b84a14b87f1b121488649eb5b4eaa06467a2387373f750da74bdcb5679"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:374b161753a247904aec7a32d45e165302b76b6e83d22d099bf3ff7c232c888f"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_i686.whl", hash = "sha256:9ca0a1ddb1d835b5f742db9711c6cf59f213a1ad0088cb1e924a005fd399e7d8"}, - {file = "ruff-0.0.269-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:5a20658f0b97d207c7841c13d528f36d666bf445b00b01139f28a8ccb80093bb"}, - {file = "ruff-0.0.269-py3-none-win32.whl", hash = "sha256:03ff42bc91ceca58e0f0f072cb3f9286a9208f609812753474e799a997cdad1a"}, - {file = "ruff-0.0.269-py3-none-win_amd64.whl", hash = "sha256:f3b59ccff57b21ef0967ea8021fd187ec14c528ec65507d8bcbe035912050776"}, - {file = "ruff-0.0.269-py3-none-win_arm64.whl", hash = "sha256:bbeb857b1e508a4487bdb02ca1e6d41dd8d5ac5335a5246e25de8a3dff38c1ff"}, - {file = "ruff-0.0.269.tar.gz", hash = "sha256:11ddcfbab32cf5c420ea9dd5531170ace5a3e59c16d9251c7bd2581f7b16f602"}, + {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a7f772696b4cdc0a3b2e527fc3c7ccc41cdcb98f5c80fdd4f2b8c50eb1458196"}, + {file = "ruff-0.1.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:934832f6ed9b34a7d5feea58972635c2039c7a3b434fe5ba2ce015064cb6e955"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea0d3e950e394c4b332bcdd112aa566010a9f9c95814844a7468325290aabfd9"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:9bd4025b9c5b429a48280785a2b71d479798a69f5c2919e7d274c5f4b32c3607"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e1ad00662305dcb1e987f5ec214d31f7d6a062cae3e74c1cbccef15afd96611d"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = 
"sha256:4b077ce83f47dd6bea1991af08b140e8b8339f0ba8cb9b7a484c30ebab18a23f"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4a88efecec23c37b11076fe676e15c6cdb1271a38f2b415e381e87fe4517f18"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5b25093dad3b055667730a9b491129c42d45e11cdb7043b702e97125bcec48a1"}, + {file = "ruff-0.1.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:231d8fb11b2cc7c0366a326a66dafc6ad449d7fcdbc268497ee47e1334f66f77"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:09c415716884950080921dd6237767e52e227e397e2008e2bed410117679975b"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0f58948c6d212a6b8d41cd59e349751018797ce1727f961c2fa755ad6208ba45"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:190a566c8f766c37074d99640cd9ca3da11d8deae2deae7c9505e68a4a30f740"}, + {file = "ruff-0.1.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:6464289bd67b2344d2a5d9158d5eb81025258f169e69a46b741b396ffb0cda95"}, + {file = "ruff-0.1.11-py3-none-win32.whl", hash = "sha256:9b8f397902f92bc2e70fb6bebfa2139008dc72ae5177e66c383fa5426cb0bf2c"}, + {file = "ruff-0.1.11-py3-none-win_amd64.whl", hash = "sha256:eb85ee287b11f901037a6683b2374bb0ec82928c5cbc984f575d0437979c521a"}, + {file = "ruff-0.1.11-py3-none-win_arm64.whl", hash = "sha256:97ce4d752f964ba559c7023a86e5f8e97f026d511e48013987623915431c7ea9"}, + {file = "ruff-0.1.11.tar.gz", hash = "sha256:f9d4d88cb6eeb4dfe20f9f0519bd2eaba8119bde87c3d5065c541dbae2b5a2cb"}, ] [[package]] name = "s3transfer" -version = "0.6.0" +version = "0.10.0" description = "An Amazon S3 Transfer Manager" optional = false -python-versions = ">= 3.7" +python-versions = ">= 3.8" files = [ - {file = "s3transfer-0.6.0-py3-none-any.whl", hash = "sha256:06176b74f3a15f61f1b4f25a1fc29a4429040b7647133a463da8fa5bd28d5ecd"}, - {file = "s3transfer-0.6.0.tar.gz", hash = "sha256:2ed07d3866f523cc561bf4a00fc5535827981b117dd7876f036b0c1aca42c947"}, + {file = "s3transfer-0.10.0-py3-none-any.whl", hash = "sha256:3cdb40f5cfa6966e812209d0994f2a4709b561c88e90cf00c2696d2df4e56b2e"}, + {file = "s3transfer-0.10.0.tar.gz", hash = "sha256:d0c8bbf672d5eebbe4e57945e23b972d963f07d82f661cabf678a5c88831595b"}, ] [package.dependencies] -botocore = ">=1.12.36,<2.0a.0" +botocore = ">=1.33.2,<2.0a.0" [package.extras] -crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] +crt = ["botocore[crt] (>=1.33.2,<2.0a.0)"] [[package]] name = "sarif-om" @@ -2493,16 +2421,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = 
"wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -2740,4 +2658,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c4e38082d246636903e15c02fbf8364c6afc1fd35d36a81c49f596ba68fc739b" +content-hash = "35c237fe6a9278b2dc65b06ed96bde5afb9e393d52c01b00c59acf1df3a8d482" diff --git a/pre-commit.py b/pre-commit.py index dc0b9ed588..c5ed63ac44 100755 --- a/pre-commit.py +++ b/pre-commit.py @@ -36,17 +36,17 @@ def rustfmt(fix_inplace: bool = False, no_color: bool = False) -> str: return cmd -def black(fix_inplace: bool) -> str: - cmd = "poetry run black" - if not fix_inplace: - cmd += " --diff --check" +def ruff_check(fix_inplace: bool) -> str: + cmd = "poetry run ruff check" + if fix_inplace: + cmd += " --fix" return cmd -def ruff(fix_inplace: bool) -> str: - cmd = "poetry run ruff" - if fix_inplace: - cmd += " --fix" +def ruff_format(fix_inplace: bool) -> str: + cmd = "poetry run ruff format" + if not fix_inplace: + cmd += " --diff --check" return cmd @@ -109,16 +109,16 @@ if __name__ == "__main__": no_color=args.no_color, ) check( - name="black", + name="ruff check", suffix=".py", - cmd=black(fix_inplace=args.fix_inplace), + cmd=ruff_check(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) check( - name="ruff", + name="ruff format", suffix=".py", - cmd=ruff(fix_inplace=args.fix_inplace), + cmd=ruff_format(fix_inplace=args.fix_inplace), changed_files=files, no_color=args.no_color, ) diff --git a/pyproject.toml b/pyproject.toml index 401acaeba4..bb04123e05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ authors = [] [tool.poetry.dependencies] python = "^3.9" -pytest = "^7.3.1" +pytest = "^7.4.4" psycopg2-binary = "^2.9.6" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} @@ -17,7 +17,7 @@ aiopg = "^1.4.0" Jinja2 = "^3.0.2" types-requests = "^2.31.0.0" types-psycopg2 = "^2.9.21.10" -boto3 = "^1.26.16" +boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} moto = {extras = ["server"], version = "^4.1.2"} backoff = 
"^2.2.1" @@ -40,22 +40,13 @@ pytest-split = "^0.8.1" zstandard = "^0.21.0" [tool.poetry.group.dev.dependencies] -black = "^23.3.0" mypy = "==1.3.0" -ruff = "^0.0.269" +ruff = "^0.1.11" [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" -[tool.black] -line-length = 100 -extend-exclude = ''' -/( - vendor -)/ -''' - [tool.mypy] exclude = "^vendor/" check_untyped_defs = true @@ -82,7 +73,9 @@ ignore_missing_imports = true [tool.ruff] target-version = "py39" extend-exclude = ["vendor/"] -ignore = ["E501"] +ignore = [ + "E501", # Line too long, we don't want to be too strict about it +] select = [ "E", # pycodestyle "F", # Pyflakes @@ -90,3 +83,4 @@ select = [ "W", # pycodestyle "B", # bugbear ] +line-length = 100 # this setting is rather guidance, it won't fail if it can't make the shorter diff --git a/s3_scrubber/Cargo.toml b/s3_scrubber/Cargo.toml index fdae378d55..4d136472e0 100644 --- a/s3_scrubber/Cargo.toml +++ b/s3_scrubber/Cargo.toml @@ -6,6 +6,7 @@ license.workspace = true [dependencies] aws-sdk-s3.workspace = true +aws-smithy-async.workspace = true either.workspace = true tokio-rustls.workspace = true anyhow.workspace = true @@ -39,3 +40,5 @@ tracing-subscriber.workspace = true clap.workspace = true tracing-appender = "0.2" histogram = "0.7" + +futures.workspace = true diff --git a/s3_scrubber/src/lib.rs b/s3_scrubber/src/lib.rs index 8fb1346c8e..d2842877d0 100644 --- a/s3_scrubber/src/lib.rs +++ b/s3_scrubber/src/lib.rs @@ -16,10 +16,12 @@ use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; use aws_config::profile::ProfileFileCredentialsProvider; +use aws_config::retry::RetryConfig; use aws_config::sso::SsoCredentialsProvider; use aws_config::BehaviorVersion; -use aws_sdk_s3::config::Region; +use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; use aws_sdk_s3::{Client, Config}; +use aws_smithy_async::rt::sleep::TokioSleep; use clap::ValueEnum; use pageserver::tenant::TENANTS_SEGMENT_NAME; @@ -283,9 +285,13 @@ pub fn init_s3_client(account_id: Option, bucket_region: Region) -> Clie ) }; + let sleep_impl: Arc = Arc::new(TokioSleep::new()); + let mut builder = Config::builder() .behavior_version(BehaviorVersion::v2023_11_09()) .region(bucket_region) + .retry_config(RetryConfig::adaptive().with_max_attempts(3)) + .sleep_impl(SharedAsyncSleep::from(sleep_impl)) .credentials_provider(credentials_provider); if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { diff --git a/s3_scrubber/src/main.rs b/s3_scrubber/src/main.rs index ef020edc2a..957213856b 100644 --- a/s3_scrubber/src/main.rs +++ b/s3_scrubber/src/main.rs @@ -1,3 +1,4 @@ +use pageserver_api::shard::TenantShardId; use s3_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use s3_scrubber::scan_metadata::scan_metadata; use s3_scrubber::{init_logging, BucketConfig, ConsoleConfig, NodeKind, TraversingDepth}; @@ -34,6 +35,8 @@ enum Command { ScanMetadata { #[arg(short, long, default_value_t = false)] json: bool, + #[arg(long = "tenant-id", num_args = 0..)] + tenant_ids: Vec, }, } @@ -57,35 +60,37 @@ async fn main() -> anyhow::Result<()> { )); match cli.command { - Command::ScanMetadata { json } => match scan_metadata(bucket_config.clone()).await { - Err(e) => { - tracing::error!("Failed: {e}"); - Err(e) - } - Ok(summary) => { - if json { - println!("{}", serde_json::to_string(&summary).unwrap()) - } else { - println!("{}", 
summary.summary_string()); + Command::ScanMetadata { json, tenant_ids } => { + match scan_metadata(bucket_config.clone(), tenant_ids).await { + Err(e) => { + tracing::error!("Failed: {e}"); + Err(e) } - if summary.is_fatal() { - Err(anyhow::anyhow!("Fatal scrub errors detected")) - } else if summary.is_empty() { - // Strictly speaking an empty bucket is a valid bucket, but if someone ran the - // scrubber they were likely expecting to scan something, and if we see no timelines - // at all then it's likely due to some configuration issues like a bad prefix - Err(anyhow::anyhow!( - "No timelines found in bucket {} prefix {}", - bucket_config.bucket, - bucket_config - .prefix_in_bucket - .unwrap_or("".to_string()) - )) - } else { - Ok(()) + Ok(summary) => { + if json { + println!("{}", serde_json::to_string(&summary).unwrap()) + } else { + println!("{}", summary.summary_string()); + } + if summary.is_fatal() { + Err(anyhow::anyhow!("Fatal scrub errors detected")) + } else if summary.is_empty() { + // Strictly speaking an empty bucket is a valid bucket, but if someone ran the + // scrubber they were likely expecting to scan something, and if we see no timelines + // at all then it's likely due to some configuration issues like a bad prefix + Err(anyhow::anyhow!( + "No timelines found in bucket {} prefix {}", + bucket_config.bucket, + bucket_config + .prefix_in_bucket + .unwrap_or("".to_string()) + )) + } else { + Ok(()) + } } } - }, + } Command::FindGarbage { node_kind, depth, diff --git a/s3_scrubber/src/scan_metadata.rs b/s3_scrubber/src/scan_metadata.rs index bcc4d2e618..bfde8f0213 100644 --- a/s3_scrubber/src/scan_metadata.rs +++ b/s3_scrubber/src/scan_metadata.rs @@ -187,10 +187,17 @@ Timeline layer count: {6} } /// Scan the pageserver metadata in an S3 bucket, reporting errors and statistics. -pub async fn scan_metadata(bucket_config: BucketConfig) -> anyhow::Result { +pub async fn scan_metadata( + bucket_config: BucketConfig, + tenant_ids: Vec, +) -> anyhow::Result { let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?; - let tenants = stream_tenants(&s3_client, &target); + let tenants = if tenant_ids.is_empty() { + futures::future::Either::Left(stream_tenants(&s3_client, &target)) + } else { + futures::future::Either::Right(futures::stream::iter(tenant_ids.into_iter().map(Ok))) + }; // How many tenants to process in parallel. We need to be mindful of pageservers // accessing the same per tenant prefixes, so use a lower setting than pageservers. diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 4015c27933..364cad7892 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -54,6 +54,7 @@ postgres_ffi.workspace = true pq_proto.workspace = true remote_storage.workspace = true safekeeper_api.workspace = true +sha2.workspace = true sd-notify.workspace = true storage_broker.workspace = true tokio-stream.workspace = true diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 7aadd67ac6..591bfea182 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -66,12 +66,10 @@ impl FileStorage { /// Create file storage for a new timeline, but don't persist it yet. 
pub fn create_new( - ttid: &TenantTimelineId, + timeline_dir: Utf8PathBuf, conf: &SafeKeeperConf, state: SafeKeeperState, ) -> Result { - let timeline_dir = conf.timeline_dir(ttid); - let store = FileStorage { timeline_dir, conf: conf.clone(), @@ -277,7 +275,8 @@ mod test { .await .expect("failed to create timeline dir"); let state = SafeKeeperState::empty(); - let storage = FileStorage::create_new(ttid, conf, state.clone())?; + let timeline_dir = conf.timeline_dir(ttid); + let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; Ok((storage, state)) } diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs new file mode 100644 index 0000000000..ef88eb27e3 --- /dev/null +++ b/safekeeper/src/copy_timeline.rs @@ -0,0 +1,250 @@ +use std::sync::Arc; + +use anyhow::{bail, Result}; +use camino::Utf8PathBuf; + +use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use tokio::{ + fs::OpenOptions, + io::{AsyncSeekExt, AsyncWriteExt}, +}; +use tracing::{info, warn}; +use utils::{id::TenantTimelineId, lsn::Lsn}; + +use crate::{ + control_file::{FileStorage, Storage}, + pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, + safekeeper::SafeKeeperState, + timeline::{Timeline, TimelineError}, + wal_backup::copy_s3_segments, + wal_storage::{wal_file_paths, WalReader}, + GlobalTimelines, SafeKeeperConf, +}; + +// we don't want to have more than 10 segments on disk after copy, because they take space +const MAX_BACKUP_LAG: u64 = 10 * WAL_SEGMENT_SIZE as u64; + +pub struct Request { + pub source: Arc, + pub until_lsn: Lsn, + pub destination_ttid: TenantTimelineId, +} + +pub async fn handle_request(request: Request) -> Result<()> { + // TODO: request.until_lsn MUST be a valid LSN, and we cannot check it :( + // if LSN will point to the middle of a WAL record, timeline will be in "broken" state + + match GlobalTimelines::get(request.destination_ttid) { + // timeline already exists. would be good to check that this timeline is the copy + // of the source timeline, but it isn't obvious how to do that + Ok(_) => return Ok(()), + // timeline not found, we are going to create it + Err(TimelineError::NotFound(_)) => {} + // error, probably timeline was deleted + res => { + res?; + } + } + + let conf = &GlobalTimelines::get_global_config(); + let ttid = request.destination_ttid; + + let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; + + let (mem_state, state) = request.source.get_state().await; + let start_lsn = state.timeline_start_lsn; + if start_lsn == Lsn::INVALID { + bail!("timeline is not initialized"); + } + let backup_lsn = mem_state.backup_lsn; + + { + let commit_lsn = mem_state.commit_lsn; + let flush_lsn = request.source.get_flush_lsn().await; + + info!( + "collected info about source timeline: start_lsn={}, backup_lsn={}, commit_lsn={}, flush_lsn={}", + start_lsn, backup_lsn, commit_lsn, flush_lsn + ); + + assert!(backup_lsn >= start_lsn); + assert!(commit_lsn >= start_lsn); + assert!(flush_lsn >= start_lsn); + + if request.until_lsn > flush_lsn { + bail!("requested LSN is beyond the end of the timeline"); + } + if request.until_lsn < start_lsn { + bail!("requested LSN is before the start of the timeline"); + } + + if request.until_lsn > commit_lsn { + warn!("copy_timeline WAL is not fully committed"); + } + + if backup_lsn < request.until_lsn && request.until_lsn.0 - backup_lsn.0 > MAX_BACKUP_LAG { + // we have a lot of segments that are not backed up. 
we can try to wait here until + // segments will be backed up to remote storage, but it's not clear how long to wait + bail!("too many segments are not backed up"); + } + } + + let wal_seg_size = state.server.wal_seg_size as usize; + if wal_seg_size == 0 { + bail!("wal_seg_size is not set"); + } + + let first_segment = start_lsn.segment_number(wal_seg_size); + let last_segment = request.until_lsn.segment_number(wal_seg_size); + + let new_backup_lsn = { + // we can't have new backup_lsn greater than existing backup_lsn or start of the last segment + let max_backup_lsn = backup_lsn.min(Lsn(last_segment * wal_seg_size as u64)); + + if max_backup_lsn <= start_lsn { + // probably we are starting from the first segment, which was not backed up yet. + // note that start_lsn can be in the middle of the segment + start_lsn + } else { + // we have some segments backed up, so we will assume all WAL below max_backup_lsn is backed up + assert!(max_backup_lsn.segment_offset(wal_seg_size) == 0); + max_backup_lsn + } + }; + + // all previous segments will be copied inside S3 + let first_ondisk_segment = new_backup_lsn.segment_number(wal_seg_size); + assert!(first_ondisk_segment <= last_segment); + assert!(first_ondisk_segment >= first_segment); + + copy_s3_segments( + wal_seg_size, + &request.source.ttid, + &request.destination_ttid, + first_segment, + first_ondisk_segment, + ) + .await?; + + copy_disk_segments( + conf, + &state, + wal_seg_size, + &request.source.ttid, + new_backup_lsn, + request.until_lsn, + &tli_dir_path, + ) + .await?; + + let mut new_state = SafeKeeperState::new( + &request.destination_ttid, + state.server.clone(), + vec![], + request.until_lsn, + start_lsn, + ); + new_state.timeline_start_lsn = start_lsn; + new_state.peer_horizon_lsn = request.until_lsn; + new_state.backup_lsn = new_backup_lsn; + + let mut file_storage = FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone())?; + file_storage.persist(&new_state).await?; + + // now we have a ready timeline in a temp directory + validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; + load_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; + + Ok(()) +} + +async fn copy_disk_segments( + conf: &SafeKeeperConf, + persisted_state: &SafeKeeperState, + wal_seg_size: usize, + source_ttid: &TenantTimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + tli_dir_path: &Utf8PathBuf, +) -> Result<()> { + let mut wal_reader = WalReader::new( + conf.workdir.clone(), + conf.timeline_dir(source_ttid), + persisted_state, + start_lsn, + true, + )?; + + let mut buf = [0u8; MAX_SEND_SIZE]; + + let first_segment = start_lsn.segment_number(wal_seg_size); + let last_segment = end_lsn.segment_number(wal_seg_size); + + for segment in first_segment..=last_segment { + let segment_start = segment * wal_seg_size as u64; + let segment_end = segment_start + wal_seg_size as u64; + + let copy_start = segment_start.max(start_lsn.0); + let copy_end = segment_end.min(end_lsn.0); + + let copy_start = copy_start - segment_start; + let copy_end = copy_end - segment_start; + + let wal_file_path = { + let (normal, partial) = wal_file_paths(tli_dir_path, segment, wal_seg_size)?; + + if segment == last_segment { + partial + } else { + normal + } + }; + + write_segment( + &mut buf, + &wal_file_path, + wal_seg_size as u64, + copy_start, + copy_end, + &mut wal_reader, + ) + .await?; + } + + Ok(()) +} + +async fn write_segment( + buf: &mut [u8], + file_path: &Utf8PathBuf, + wal_seg_size: u64, + from: u64, + to: u64, + reader: &mut 
WalReader, +) -> Result<()> { + assert!(from <= to); + assert!(to <= wal_seg_size); + + let mut file = OpenOptions::new() + .create(true) + .write(true) + .open(&file_path) + .await?; + + // maybe fill with zeros, as in wal_storage.rs? + file.set_len(wal_seg_size).await?; + file.seek(std::io::SeekFrom::Start(from)).await?; + + let mut bytes_left = to - from; + while bytes_left > 0 { + let len = bytes_left as usize; + let len = len.min(buf.len()); + let len = reader.read(&mut buf[..len]).await?; + file.write_all(&buf[..len]).await?; + bytes_left -= len as u64; + } + + file.flush().await?; + file.sync_all().await?; + Ok(()) +} diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index daf9255ecb..c9ff1afdea 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -7,13 +7,16 @@ use std::io::Read; use std::path::PathBuf; use std::sync::Arc; +use anyhow::bail; use anyhow::Result; use camino::Utf8Path; use chrono::{DateTime, Utc}; use postgres_ffi::XLogSegNo; +use postgres_ffi::MAX_SEND_SIZE; use serde::Deserialize; use serde::Serialize; +use sha2::{Digest, Sha256}; use utils::id::NodeId; use utils::id::TenantTimelineId; use utils::id::{TenantId, TimelineId}; @@ -25,6 +28,7 @@ use crate::safekeeper::TermHistory; use crate::SafeKeeperConf; use crate::send_wal::WalSenderState; +use crate::wal_storage::WalReader; use crate::GlobalTimelines; /// Various filters that influence the resulting JSON output. @@ -300,3 +304,56 @@ fn build_config(config: SafeKeeperConf) -> Config { wal_backup_enabled: config.wal_backup_enabled, } } + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct TimelineDigestRequest { + pub from_lsn: Lsn, + pub until_lsn: Lsn, +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct TimelineDigest { + pub sha256: String, +} + +pub async fn calculate_digest( + tli: &Arc, + request: TimelineDigestRequest, +) -> Result { + if request.from_lsn > request.until_lsn { + bail!("from_lsn is greater than until_lsn"); + } + + let conf = GlobalTimelines::get_global_config(); + let (_, persisted_state) = tli.get_state().await; + + if persisted_state.timeline_start_lsn > request.from_lsn { + bail!("requested LSN is before the start of the timeline"); + } + + let mut wal_reader = WalReader::new( + conf.workdir.clone(), + tli.timeline_dir.clone(), + &persisted_state, + request.from_lsn, + true, + )?; + + let mut hasher = Sha256::new(); + let mut buf = [0u8; MAX_SEND_SIZE]; + + let mut bytes_left = (request.until_lsn.0 - request.from_lsn.0) as usize; + while bytes_left > 0 { + let bytes_to_read = std::cmp::min(buf.len(), bytes_left); + let bytes_read = wal_reader.read(&mut buf[..bytes_to_read]).await?; + if bytes_read == 0 { + bail!("wal_reader.read returned 0 bytes"); + } + hasher.update(&buf[..bytes_read]); + bytes_left -= bytes_read; + } + + let digest = hasher.finalize(); + let digest = hex::encode(digest); + Ok(TimelineDigest { sha256: digest }) +} diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 25a3334e63..5283ea19c1 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -2,7 +2,7 @@ use hyper::{Body, Request, Response, StatusCode, Uri}; use once_cell::sync::Lazy; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::models::SkTimelineInfo; +use safekeeper_api::models::{SkTimelineInfo, TimelineCopyRequest}; use serde::{Deserialize, Serialize}; use std::collections::{HashMap, HashSet}; use std::fmt; @@ -14,19 +14,21 @@ use tokio::fs::File; use tokio::io::AsyncReadExt; use 
tokio_util::sync::CancellationToken; use utils::failpoint_support::failpoints_handler; +use utils::http::request::parse_query_param; use std::io::Write as _; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; -use tracing::info_span; +use tracing::{info_span, Instrument}; use utils::http::endpoint::{request_span, ChannelWriter}; +use crate::debug_dump::TimelineDigestRequest; use crate::receive_wal::WalReceiverState; use crate::safekeeper::Term; use crate::safekeeper::{ServerInfo, TermLsn}; use crate::send_wal::WalSenderState; use crate::timeline::PeerInfo; -use crate::{debug_dump, pull_timeline}; +use crate::{copy_timeline, debug_dump, pull_timeline}; use crate::timelines_global_map::TimelineDeleteForceResult; use crate::GlobalTimelines; @@ -204,6 +206,56 @@ async fn timeline_pull_handler(mut request: Request) -> Result) -> Result, ApiError> { + check_permission(&request, None)?; + + let request_data: TimelineCopyRequest = json_request(&mut request).await?; + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "source_timeline_id")?, + ); + + let source = GlobalTimelines::get(ttid)?; + + copy_timeline::handle_request(copy_timeline::Request{ + source, + until_lsn: request_data.until_lsn, + destination_ttid: TenantTimelineId::new(ttid.tenant_id, request_data.target_timeline_id), + }) + .instrument(info_span!("copy_timeline", from=%ttid, to=%request_data.target_timeline_id, until_lsn=%request_data.until_lsn)) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) +} + +async fn timeline_digest_handler(request: Request) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let from_lsn: Option = parse_query_param(&request, "from_lsn")?; + let until_lsn: Option = parse_query_param(&request, "until_lsn")?; + + let request = TimelineDigestRequest { + from_lsn: from_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!( + "from_lsn is required" + )))?, + until_lsn: until_lsn.ok_or(ApiError::BadRequest(anyhow::anyhow!( + "until_lsn is required" + )))?, + }; + + let tli = GlobalTimelines::get(ttid).map_err(ApiError::from)?; + + let response = debug_dump::calculate_digest(&tli, request) + .await + .map_err(ApiError::InternalServerError)?; + json_response(StatusCode::OK, response) +} + /// Download a file from the timeline directory. 
// TODO: figure out a better way to copy files between safekeepers async fn timeline_files_handler(request: Request) -> Result, ApiError> { @@ -472,11 +524,18 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder "/v1/tenant/:tenant_id/timeline/:timeline_id/file/:filename", |r| request_span(r, timeline_files_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", + |r| request_span(r, timeline_copy_handler), + ) // for tests .post("/v1/record_safekeeper_info/:tenant_id/:timeline_id", |r| { request_span(r, record_safekeeper_info) }) .get("/v1/debug_dump", |r| request_span(r, dump_debug_handler)) + .get("/v1/tenant/:tenant_id/timeline/:timeline_id/digest", |r| { + request_span(r, timeline_digest_handler) + }) } #[cfg(test)] diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 3a086f1f54..fc5f99eb00 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -16,6 +16,7 @@ mod auth; pub mod broker; pub mod control_file; pub mod control_file_upgrade; +pub mod copy_timeline; pub mod debug_dump; pub mod handler; pub mod http; diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index ad3a18a536..93b51f32c0 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,16 +1,24 @@ +use std::sync::Arc; + +use camino::Utf8PathBuf; +use camino_tempfile::Utf8TempDir; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize}; use anyhow::{bail, Context, Result}; use tokio::io::AsyncWriteExt; use tracing::info; -use utils::id::{TenantId, TenantTimelineId, TimelineId}; +use utils::{ + id::{TenantId, TenantTimelineId, TimelineId}, + lsn::Lsn, +}; use crate::{ control_file, debug_dump, http::routes::TimelineStatus, + timeline::{Timeline, TimelineError}, wal_storage::{self, Storage}, - GlobalTimelines, + GlobalTimelines, SafeKeeperConf, }; /// Info about timeline on safekeeper ready for reporting. @@ -91,7 +99,7 @@ pub async fn handle_request(request: Request) -> Result { async fn pull_timeline(status: TimelineStatus, host: String) -> Result { let ttid = TenantTimelineId::new(status.tenant_id, status.timeline_id); info!( - "Pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", + "pulling timeline {} from safekeeper {}, commit_lsn={}, flush_lsn={}, term={}, epoch={}", ttid, host, status.commit_lsn, @@ -121,14 +129,14 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result if dump.timelines.len() != 1 { bail!( - "Expected to fetch single timeline, got {} timelines", + "expected to fetch single timeline, got {} timelines", dump.timelines.len() ); } let timeline = dump.timelines.into_iter().next().unwrap(); let disk_content = timeline.disk_content.ok_or(anyhow::anyhow!( - "Timeline {} doesn't have disk content", + "timeline {} doesn't have disk content", ttid ))?; @@ -155,29 +163,12 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result filenames.insert(0, "safekeeper.control".to_string()); info!( - "Downloading {} files from safekeeper {}", + "downloading {} files from safekeeper {}", filenames.len(), host ); - // Creating temp directory for a new timeline. It needs to be - // located on the same filesystem as the rest of the timelines. - - // conf.workdir is usually /storage/safekeeper/data - // will try to transform it into /storage/safekeeper/tmp - let temp_base = conf - .workdir - .parent() - .ok_or(anyhow::anyhow!("workdir has no parent"))? 
- .join("tmp"); - - tokio::fs::create_dir_all(&temp_base).await?; - - let tli_dir = camino_tempfile::Builder::new() - .suffix("_temptli") - .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) - .tempdir_in(temp_base)?; - let tli_dir_path = tli_dir.path().to_path_buf(); + let (_tmp_dir, tli_dir_path) = create_temp_timeline_dir(conf, ttid).await?; // Note: some time happens between fetching list of files and fetching files themselves. // It's possible that some files will be removed from safekeeper and we will fail to fetch them. @@ -201,47 +192,105 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result // TODO: fsync? // Let's create timeline from temp directory and verify that it's correct + let (commit_lsn, flush_lsn) = validate_temp_timeline(conf, ttid, &tli_dir_path).await?; + info!( + "finished downloading timeline {}, commit_lsn={}, flush_lsn={}", + ttid, commit_lsn, flush_lsn + ); + assert!(status.commit_lsn <= status.flush_lsn); - let control_path = tli_dir_path.join("safekeeper.control"); + // Finally, load the timeline. + let _tli = load_temp_timeline(conf, ttid, &tli_dir_path).await?; + + Ok(Response { + safekeeper_host: host, + }) +} + +/// Create temp directory for a new timeline. It needs to be located on the same +/// filesystem as the rest of the timelines. It will be automatically deleted when +/// Utf8TempDir goes out of scope. +pub async fn create_temp_timeline_dir( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, +) -> Result<(Utf8TempDir, Utf8PathBuf)> { + // conf.workdir is usually /storage/safekeeper/data + // will try to transform it into /storage/safekeeper/tmp + let temp_base = conf + .workdir + .parent() + .ok_or(anyhow::anyhow!("workdir has no parent"))? + .join("tmp"); + + tokio::fs::create_dir_all(&temp_base).await?; + + let tli_dir = camino_tempfile::Builder::new() + .suffix("_temptli") + .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) + .tempdir_in(temp_base)?; + + let tli_dir_path = tli_dir.path().to_path_buf(); + + Ok((tli_dir, tli_dir_path)) +} + +/// Do basic validation of a temp timeline, before moving it to the global map. +pub async fn validate_temp_timeline( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + path: &Utf8PathBuf, +) -> Result<(Lsn, Lsn)> { + let control_path = path.join("safekeeper.control"); let control_store = control_file::FileStorage::load_control_file(control_path)?; if control_store.server.wal_seg_size == 0 { bail!("wal_seg_size is not set"); } - let wal_store = - wal_storage::PhysicalStorage::new(&ttid, tli_dir_path.clone(), conf, &control_store)?; + let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?; - let commit_lsn = status.commit_lsn; + let commit_lsn = control_store.commit_lsn; let flush_lsn = wal_store.flush_lsn(); - info!( - "Finished downloading timeline {}, commit_lsn={}, flush_lsn={}", - ttid, commit_lsn, flush_lsn - ); - assert!(status.commit_lsn <= status.flush_lsn); + Ok((commit_lsn, flush_lsn)) +} + +/// Move timeline from a temp directory to the main storage, and load it to the global map. +/// This operation is done under a lock to prevent bugs if several concurrent requests are +/// trying to load the same timeline. Note that it doesn't guard against creating the +/// timeline with the same ttid, but no one should be doing this anyway. 
+pub async fn load_temp_timeline( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + tmp_path: &Utf8PathBuf, +) -> Result> { + // Take a lock to prevent concurrent loadings + let load_lock = GlobalTimelines::loading_lock().await; + let guard = load_lock.lock().await; + + if !matches!(GlobalTimelines::get(ttid), Err(TimelineError::NotFound(_))) { + bail!("timeline already exists, cannot overwrite it") + } // Move timeline dir to the correct location let timeline_path = conf.timeline_dir(&ttid); info!( - "Moving timeline {} from {} to {}", - ttid, tli_dir_path, timeline_path + "moving timeline {} from {} to {}", + ttid, tmp_path, timeline_path ); tokio::fs::create_dir_all(conf.tenant_dir(&ttid.tenant_id)).await?; - tokio::fs::rename(tli_dir_path, &timeline_path).await?; + tokio::fs::rename(tmp_path, &timeline_path).await?; - let tli = GlobalTimelines::load_timeline(ttid) + let tli = GlobalTimelines::load_timeline(&guard, ttid) .await .context("Failed to load timeline after copy")?; info!( - "Loaded timeline {}, flush_lsn={}", + "loaded timeline {}, flush_lsn={}", ttid, tli.get_flush_lsn().await ); - Ok(Response { - safekeeper_host: host, - }) + Ok(tli) } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index bdc9088138..2f284abe8c 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -141,7 +141,8 @@ impl SharedState { // We don't want to write anything to disk, because we may have existing timeline there. // These functions should not change anything on disk. - let control_store = control_file::FileStorage::create_new(ttid, conf, state)?; + let timeline_dir = conf.timeline_dir(ttid); + let control_store = control_file::FileStorage::create_new(timeline_dir, conf, state)?; let wal_store = wal_storage::PhysicalStorage::new(ttid, conf.timeline_dir(ttid), conf, &control_store)?; let sk = SafeKeeper::new(control_store, wal_store, conf.my_id)?; diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index cbb3342e40..92ac5ba66d 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -21,8 +21,12 @@ struct GlobalTimelinesState { timelines: HashMap>, wal_backup_launcher_tx: Option>, conf: Option, + load_lock: Arc>, } +// Used to prevent concurrent timeline loading. +pub struct TimelineLoadLock; + impl GlobalTimelinesState { /// Get configuration, which must be set once during init. fn get_conf(&self) -> &SafeKeeperConf { @@ -63,6 +67,7 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { timelines: HashMap::new(), wal_backup_launcher_tx: None, conf: None, + load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), }) }); @@ -174,8 +179,16 @@ impl GlobalTimelines { Ok(()) } + /// Take a lock for timeline loading. + pub async fn loading_lock() -> Arc> { + TIMELINES_STATE.lock().unwrap().load_lock.clone() + } + /// Load timeline from disk to the memory. 
- pub async fn load_timeline(ttid: TenantTimelineId) -> Result> { + pub async fn load_timeline<'a>( + _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, + ttid: TenantTimelineId, + ) -> Result> { let (conf, wal_backup_launcher_tx) = TIMELINES_STATE.lock().unwrap().get_dependencies(); match Timeline::load_timeline(&conf, ttid, wal_backup_launcher_tx) { diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index c99bbc7d61..e4499eaf50 100644 --- a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -7,7 +7,7 @@ use tokio::task::JoinHandle; use utils::id::NodeId; use std::cmp::min; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::pin::Pin; use std::sync::Arc; use std::time::Duration; @@ -531,3 +531,62 @@ pub async fn read_object( Ok(Box::pin(reader)) } + +/// Copy segments from one timeline to another. Used in copy_timeline. +pub async fn copy_s3_segments( + wal_seg_size: usize, + src_ttid: &TenantTimelineId, + dst_ttid: &TenantTimelineId, + from_segment: XLogSegNo, + to_segment: XLogSegNo, +) -> Result<()> { + const SEGMENTS_PROGRESS_REPORT_INTERVAL: u64 = 1024; + + let storage = REMOTE_STORAGE + .get() + .expect("failed to get remote storage") + .as_ref() + .unwrap(); + + let relative_dst_path = + Utf8Path::new(&dst_ttid.tenant_id.to_string()).join(dst_ttid.timeline_id.to_string()); + + let remote_path = RemotePath::new(&relative_dst_path)?; + + let files = storage.list_files(Some(&remote_path)).await?; + let uploaded_segments = &files + .iter() + .filter_map(|file| file.object_name().map(ToOwned::to_owned)) + .collect::>(); + + debug!( + "these segments have already been uploaded: {:?}", + uploaded_segments + ); + + let relative_src_path = + Utf8Path::new(&src_ttid.tenant_id.to_string()).join(src_ttid.timeline_id.to_string()); + + for segno in from_segment..to_segment { + if segno % SEGMENTS_PROGRESS_REPORT_INTERVAL == 0 { + info!("copied all segments from {} until {}", from_segment, segno); + } + + let segment_name = XLogFileName(PG_TLI, segno, wal_seg_size); + if uploaded_segments.contains(&segment_name) { + continue; + } + debug!("copying segment {}", segment_name); + + let from = RemotePath::new(&relative_src_path.join(&segment_name))?; + let to = RemotePath::new(&relative_dst_path.join(&segment_name))?; + + storage.copy_object(&from, &to).await?; + } + + info!( + "finished copying segments from {} until {}", + from_segment, to_segment + ); + Ok(()) +} diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index e7538f805c..8d138c701f 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -728,7 +728,7 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { } /// Helper returning full path to WAL segment file and its .partial brother. -fn wal_file_paths( +pub fn wal_file_paths( timeline_dir: &Utf8Path, segno: XLogSegNo, wal_seg_size: usize, diff --git a/scripts/export_import_between_pageservers.py b/scripts/export_import_between_pageservers.py index ff584bd4b0..980f343047 100755 --- a/scripts/export_import_between_pageservers.py +++ b/scripts/export_import_between_pageservers.py @@ -63,7 +63,7 @@ def subprocess_capture(capture_dir: str, cmd: List[str], **kwargs: Any) -> str: If those files already exist, we will overwrite them. Returns basepath for files with captured output. 
""" - assert type(cmd) is list + assert isinstance(cmd, list) base = os.path.basename(cmd[0]) + "_{}".format(global_counter()) basepath = os.path.join(capture_dir, base) stdout_filename = basepath + ".stdout" diff --git a/scripts/reformat b/scripts/reformat index 8688044f66..3533c4dcb8 100755 --- a/scripts/reformat +++ b/scripts/reformat @@ -6,5 +6,5 @@ set -euox pipefail echo 'Reformatting Rust code' cargo fmt echo 'Reformatting Python code' -poetry run ruff --fix test_runner scripts -poetry run black test_runner scripts +poetry run ruff check --fix test_runner scripts +poetry run ruff format test_runner scripts diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5b1a8ba27d..001d4e23a9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1101,8 +1101,8 @@ class AbstractNeonCli(abc.ABC): If `local_binpath` is true, then we are invoking a test utility """ - assert type(arguments) == list - assert type(self.COMMAND) == str + assert isinstance(arguments, list) + assert isinstance(self.COMMAND, str) if local_binpath: # Test utility @@ -3032,6 +3032,28 @@ class SafekeeperHttpClient(requests.Session): assert isinstance(res_json, dict) return res_json + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: Dict[str, Any]): + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", + json=body, + ) + res.raise_for_status() + + def timeline_digest( + self, tenant_id: TenantId, timeline_id: TimelineId, from_lsn: Lsn, until_lsn: Lsn + ) -> Dict[str, Any]: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/digest", + params={ + "from_lsn": str(from_lsn), + "until_lsn": str(until_lsn), + }, + ) + res.raise_for_status() + res_json = res.json() + assert isinstance(res_json, dict) + return res_json + def timeline_create( self, tenant_id: TenantId, diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index add6c4288a..a779dcc436 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -326,6 +326,10 @@ class PageserverHttpClient(requests.Session): res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/heatmap_upload") self.verbose_error(res) + def tenant_secondary_download(self, tenant_id: TenantId): + res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/secondary/download") + self.verbose_error(res) + def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]): assert "tenant_id" not in config.keys() res = self.put( @@ -361,9 +365,9 @@ class PageserverHttpClient(requests.Session): assert isinstance(res, dict) assert TenantId(res["id"]) == tenant_id size = res["size"] - assert type(size) == int + assert isinstance(size, int) inputs = res["inputs"] - assert type(inputs) is dict + assert isinstance(inputs, dict) return (size, inputs) def tenant_size_debug(self, tenant_id: TenantId) -> str: @@ -714,7 +718,7 @@ class PageserverHttpClient(requests.Session): ) self.verbose_error(res) - assert res.status_code == 200 + assert res.status_code in (200, 304) def evict_all_layers(self, tenant_id: TenantId, timeline_id: TimelineId): info = self.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 1e6e9a0174..8a9509ea44 100644 --- a/test_runner/performance/test_perf_olap.py +++ 
b/test_runner/performance/test_perf_olap.py @@ -42,9 +42,10 @@ def test_clickbench_create_pg_stat_statements(remote_compare: RemoteCompare): # Please do not alter the label for the query, as it is used to identify it. # Labels for ClickBench queries match the labels in ClickBench reports # on https://benchmark.clickhouse.com/ (the DB size may differ). +# +# Disable auto formatting for the list of queries so that it's easier to read +# fmt: off QUERIES: Tuple[LabelledQuery, ...] = ( - # Disable `black` formatting for the list of queries so that it's easier to read - # fmt: off ### ClickBench queries: LabelledQuery("Q0", r"SELECT COUNT(*) FROM hits;"), LabelledQuery("Q1", r"SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;"), @@ -96,8 +97,8 @@ QUERIES: Tuple[LabelledQuery, ...] = ( # LabelledQuery("NQ0", r"..."), # LabelledQuery("NQ1", r"..."), # ... - # fmt: on ) +# fmt: on EXPLAIN_STRING: str = "EXPLAIN (ANALYZE, VERBOSE, BUFFERS, COSTS, SETTINGS, FORMAT JSON)" diff --git a/test_runner/performance/test_wal_backpressure.py b/test_runner/performance/test_wal_backpressure.py index 3cb4b667ff..7eb244d378 100644 --- a/test_runner/performance/test_wal_backpressure.py +++ b/test_runner/performance/test_wal_backpressure.py @@ -32,8 +32,7 @@ def pg_compare(request) -> PgCompare: else: assert ( len(x) == 2 - ), f"request param ({request.param}) should have a format of \ - `neon_{{safekeepers_enable_fsync}}`" + ), f"request param ({request.param}) should have a format of `neon_{{safekeepers_enable_fsync}}`" # `NeonCompare` interface neon_env_builder = request.getfixturevalue("neon_env_builder") diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 32397bbcc1..ed389b1aa2 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -194,12 +194,13 @@ def test_fully_custom_config(positive_env: NeonEnv): assert set(our_tenant_config.effective_config.keys()) == set( fully_custom_config.keys() ), "ensure we cover all config options" - assert { - k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k] - for k in fully_custom_config.keys() - } == { - k: True for k in fully_custom_config.keys() - }, "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" + assert ( + { + k: initial_tenant_config.effective_config[k] != our_tenant_config.effective_config[k] + for k in fully_custom_config.keys() + } + == {k: True for k in fully_custom_config.keys()} + ), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" ps_http.tenant_detach(tenant_id) env.pageserver.tenant_attach(tenant_id, config=fully_custom_config) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 5a9c2782e6..f9d6d0a934 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -186,9 +186,7 @@ def test_backward_compatibility( else: raise - assert ( - not breaking_changes_allowed - ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" @check_ondisk_data_compatibility_if_enabled @@ -247,9 +245,7 @@ def test_forward_compatibility( else: 
raise - assert ( - not breaking_changes_allowed - ), "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" + assert not breaking_changes_allowed, "Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage" def check_neon_works(env: NeonEnv, test_output_dir: Path, sql_dump_path: Path, repo_dir: Path): diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 7ec901af34..01ecc2b95f 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -2,7 +2,6 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft - # Restart nodes with WAL end having specially crafted shape, like last record # crossing segment boundary, to test decoding issues. diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 2cd2406065..efba2033fb 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -102,9 +102,7 @@ def test_basic_eviction( ), f"Did not expect to find {local_layer} layer after evicting" empty_layers = list(filter(lambda path: path.name != "metadata", timeline_path.glob("*"))) - assert ( - not empty_layers - ), f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" + assert not empty_layers, f"After evicting all layers, timeline {tenant_id}/{timeline_id} should have no layers locally, but got: {empty_layers}" evicted_layer_map_info = client.layer_map_info(tenant_id=tenant_id, timeline_id=timeline_id) assert ( diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 340188c1ae..999e077e45 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -38,6 +38,9 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() + env.pageserver.allowed_errors.extend( + [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] + ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 573d2139ce..e29db1e252 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -145,8 +145,7 @@ def expect_updated_msg_lsn( last_msg_lsn = Lsn(timeline_details["last_received_msg_lsn"]) assert ( prev_msg_lsn is None or prev_msg_lsn < last_msg_lsn - ), f"the last received message's LSN {last_msg_lsn} hasn't been updated \ - compared to the previous message's LSN {prev_msg_lsn}" + ), f"the last received message's LSN {last_msg_lsn} hasn't been updated compared to the previous message's LSN {prev_msg_lsn}" return last_msg_lsn diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8ae4297983..a9eff99a0c 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -1,9 +1,11 @@ import random +from pathlib import Path from typing import Any, Dict, Optional import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, NeonPageserver +from fixtures.neon_fixtures 
import NeonEnvBuilder, NeonPageserver, S3Scrubber +from fixtures.pageserver.utils import assert_prefix_empty, tenant_delete_wait_completed from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.types import TenantId, TimelineId from fixtures.utils import wait_until @@ -251,6 +253,9 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): flush_ms=5000, ) + # Encourage the new location to download while still in secondary mode + pageserver_b.http_client().tenant_secondary_download(tenant_id) + migrated_generation = env.attachment_service.attach_hook_issue(tenant_id, pageserver_b.id) log.info(f"Acquired generation {migrated_generation} for destination pageserver") assert migrated_generation == initial_generation + 1 @@ -258,8 +263,6 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): # Writes and reads still work in AttachedStale. workload.validate(pageserver_a.id) - # TODO: call into secondary mode API hooks to do an upload/download sync - # Generate some more dirty writes: we expect the origin to ingest WAL in # in AttachedStale workload.churn_rows(64, pageserver_a.id, upload=False) @@ -369,3 +372,143 @@ def test_heatmap_uploads(neon_env_builder: NeonEnvBuilder): log.info(f"Read back heatmap: {heatmap_second}") assert heatmap_second != heatmap_first validate_heatmap(heatmap_second) + + +def list_layers(pageserver, tenant_id: TenantId, timeline_id: TimelineId) -> list[Path]: + """ + Inspect local storage on a pageserver to discover which layer files are present. + + :return: list of relative paths to layers, from the timeline root. + """ + timeline_path = pageserver.timeline_dir(tenant_id, timeline_id) + + def relative(p: Path) -> Path: + return p.relative_to(timeline_path) + + return sorted( + list( + map( + relative, + filter( + lambda path: path.name != "metadata" + and "ephemeral" not in path.name + and "temp" not in path.name, + timeline_path.glob("*"), + ), + ) + ) + ) + + +def test_secondary_downloads(neon_env_builder: NeonEnvBuilder): + """ + Test the overall data flow in secondary mode: + - Heatmap uploads from the attached location + - Heatmap & layer downloads from the secondary location + - Eviction of layers on the attached location results in deletion + on the secondary location as well. 
+ """ + neon_env_builder.num_pageservers = 2 + neon_env_builder.enable_pageserver_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + assert env.attachment_service is not None + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + ps_attached = env.pageservers[0] + ps_secondary = env.pageservers[1] + + workload = Workload(env, tenant_id, timeline_id) + workload.init(env.pageservers[0].id) + workload.write_rows(256, ps_attached.id) + + # Configure a secondary location + log.info("Setting up secondary location...") + ps_secondary.tenant_location_configure( + tenant_id, + { + "mode": "Secondary", + "secondary_conf": {"warm": True}, + "tenant_conf": {}, + }, + ) + readback_conf = ps_secondary.read_tenant_location_conf(tenant_id) + log.info(f"Read back conf: {readback_conf}") + + # Explicit upload/download cycle + # ============================== + log.info("Synchronizing after initial write...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # Make changes on attached pageserver, check secondary downloads them + # =================================================================== + log.info("Synchronizing after subsequent write...") + workload.churn_rows(128, ps_attached.id) + + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # FIXME: this sleep is needed to avoid on-demand promotion of the layers we evict, while + # walreceiver is still doing something. + import time + + time.sleep(5) + + # Do evictions on attached pageserver, check secondary follows along + # ================================================================== + log.info("Evicting a layer...") + layer_to_evict = list_layers(ps_attached, tenant_id, timeline_id)[0] + ps_attached.http_client().evict_layer(tenant_id, timeline_id, layer_name=layer_to_evict.name) + + log.info("Synchronizing after eviction...") + ps_attached.http_client().tenant_heatmap_upload(tenant_id) + ps_secondary.http_client().tenant_secondary_download(tenant_id) + + assert layer_to_evict not in list_layers(ps_attached, tenant_id, timeline_id) + assert list_layers(ps_attached, tenant_id, timeline_id) == list_layers( + ps_secondary, tenant_id, timeline_id + ) + + # Scrub the remote storage + # ======================== + # This confirms that the scrubber isn't upset by the presence of the heatmap + S3Scrubber(neon_env_builder.test_output_dir, neon_env_builder).scan_metadata() + + # Detach secondary and delete tenant + # =================================== + # This confirms that the heatmap gets cleaned up as well as other normal content. 
+ log.info("Detaching secondary location...") + ps_secondary.tenant_location_configure( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + }, + ) + + log.info("Deleting tenant...") + tenant_delete_wait_completed(ps_attached.http_client(), tenant_id, 10) + + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(tenant_id), + ) + ), + ) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 0dcbb23ad4..c6dbc77885 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -391,8 +391,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv): tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] assert ( tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} \ - should not be present in pageserver's memory" + ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" # Creates a tenant, and detaches it with extra paremeter that forces ignored tenant detach. @@ -430,8 +429,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv): tenants_after_detach = [tenant["id"] for tenant in client.tenant_list()] assert ( tenant_id not in tenants_after_detach - ), f"Ignored and then detached tenant {tenant_id} \ - should not be present in pageserver's memory" + ), f"Ignored and then detached tenant {tenant_id} should not be present in pageserver's memory" def test_detach_while_attaching( @@ -817,9 +815,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( if found_broken: break time.sleep(0.5) - assert ( - found_broken - ), f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" + assert found_broken, f"broken should still be in set, but it is not in the tenant state count: broken={broken}, broken_set={broken_set}" env.pageserver.tenant_load(env.initial_tenant) @@ -837,6 +833,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( break time.sleep(0.5) - assert ( - found_active - ), f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" + assert found_active, f"reloaded tenant should be active, and broken tenant set item removed: active={active}, broken_set={broken_set}" diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index dcd7232b1b..1887bca23b 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -161,12 +161,10 @@ def switch_pg_to_new_pageserver( files_before_detach = os.listdir(timeline_to_detach_local_path) assert ( "metadata" in files_before_detach - ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file,\ - but got: {files_before_detach}" + ), f"Regular timeline {timeline_to_detach_local_path} should have the metadata file, but got: {files_before_detach}" assert ( len(files_before_detach) >= 2 - ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file,\ - but got {files_before_detach}" + ), f"Regular timeline {timeline_to_detach_local_path} should have at least one layer file, but got {files_before_detach}" return timeline_to_detach_local_path diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 07fb6dc5ca..6f05d7f7cb 100644 --- 
a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -201,8 +201,8 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): len(restored_timelines) == 1 ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" restored_timeline = restored_timelines[0] - assert restored_timeline["timeline_id"] == str( - timeline_id + assert ( + restored_timeline["timeline_id"] == str(timeline_id) ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" # Check that we had to retry the downloads @@ -280,8 +280,8 @@ def test_tenant_redownloads_truncated_file_on_startup( len(restored_timelines) == 1 ), f"Tenant {tenant_id} should have its timeline reattached after its layer is downloaded from the remote storage" retored_timeline = restored_timelines[0] - assert retored_timeline["timeline_id"] == str( - timeline_id + assert ( + retored_timeline["timeline_id"] == str(timeline_id) ), f"Tenant {tenant_id} should have its old timeline {timeline_id} restored from the remote storage" # Request non-incremental logical size. Calculating it needs the layer file that diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index cf8df389c8..b4ce633531 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -566,7 +566,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): f"Pageserver last_record_lsn={pageserver_lsn}; flush_lsn={last_lsn}; lag before replay is {lag / 1024}kb" ) - endpoint.stop_and_destroy() + endpoint.stop() timeline_delete_wait_completed(ps_http, tenant_id, timeline_id) # Also delete and manually create timeline on safekeepers -- this tests @@ -1838,3 +1838,83 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): assert final_stats.get("START_REPLICATION", 0) >= 1 # walproposer should connect to each safekeeper at least once assert final_stats.get("START_WAL_PUSH", 0) >= 3 + + +@pytest.mark.parametrize("insert_rows", [0, 100, 100000, 500000]) +def test_timeline_copy(neon_env_builder: NeonEnvBuilder, insert_rows: int): + target_percents = [10, 50, 90, 100] + + neon_env_builder.num_safekeepers = 3 + # we need remote storage that supports copy_object S3 API + neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + endpoint = env.endpoints.create_start("main") + + lsns = [] + + def remember_lsn(): + lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + lsns.append(lsn) + return lsn + + # remember LSN right after timeline creation + lsn = remember_lsn() + log.info(f"LSN after timeline creation: {lsn}") + + endpoint.safe_psql("create table t(key int, value text)") + + timeline_status = env.safekeepers[0].http_client().timeline_status(tenant_id, timeline_id) + timeline_start_lsn = timeline_status.timeline_start_lsn + log.info(f"Timeline start LSN: {timeline_start_lsn}") + + current_percent = 0.0 + for new_percent in target_percents: + new_rows = insert_rows * (new_percent - current_percent) / 100 + current_percent = new_percent + + if new_rows == 0: + continue + + endpoint.safe_psql( + f"insert into t select generate_series(1, {new_rows}), repeat('payload!', 10)" + ) + + # remember LSN right after reaching new_percent + lsn = remember_lsn() + log.info(f"LSN 
after inserting {new_rows} rows: {lsn}") + + # TODO: would be also good to test cases where not all segments are uploaded to S3 + + for lsn in lsns: + new_timeline_id = TimelineId.generate() + log.info(f"Copying branch for LSN {lsn}, to timeline {new_timeline_id}") + + orig_digest = ( + env.safekeepers[0] + .http_client() + .timeline_digest(tenant_id, timeline_id, timeline_start_lsn, lsn) + ) + log.info(f"Original digest: {orig_digest}") + + for sk in env.safekeepers: + sk.http_client().copy_timeline( + tenant_id, + timeline_id, + { + "target_timeline_id": str(new_timeline_id), + "until_lsn": str(lsn), + }, + ) + + new_digest = sk.http_client().timeline_digest( + tenant_id, new_timeline_id, timeline_start_lsn, lsn + ) + log.info(f"Digest after timeline copy on safekeeper {sk.id}: {new_digest}") + + assert orig_digest == new_digest + + # TODO: test timelines can start after copy diff --git a/vm-image-spec.yaml b/vm-image-spec.yaml index 68be0b3617..704e3721d6 100644 --- a/vm-image-spec.yaml +++ b/vm-image-spec.yaml @@ -167,22 +167,21 @@ build: | && apt-get update \ && apt-get install -y \ build-essential \ - curl \ + git \ libevent-dev \ - libssl-dev \ - patchutils \ + libtool \ pkg-config - ENV PGBOUNCER_VERSION 1.21.0 - ENV PGBOUNCER_GITPATH 1_21_0 + # Note, we use pgbouncer from neondatabase/pgbouncer fork, which could contain extra commits. + # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc) + ENV PGBOUNCER_TAG pgbouncer_1_21_0-neon-1 RUN set -e \ - && curl -sfSL https://github.com/pgbouncer/pgbouncer/releases/download/pgbouncer_${PGBOUNCER_GITPATH}/pgbouncer-${PGBOUNCER_VERSION}.tar.gz -o pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ - && tar xzvf pgbouncer-${PGBOUNCER_VERSION}.tar.gz \ - && cd pgbouncer-${PGBOUNCER_VERSION} \ - && curl https://github.com/pgbouncer/pgbouncer/commit/a7b3c0a5f4caa9dbe92743d04cf1e28c4c05806c.patch | filterdiff --include a/src/server.c | patch -p1 \ + && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/neondatabase/pgbouncer.git pgbouncer \ + && cd pgbouncer \ + && ./autogen.sh \ && LDFLAGS=-static ./configure --prefix=/usr/local/pgbouncer --without-openssl \ - && make -j $(nproc) \ - && make install + && make -j $(nproc) dist_man_MANS= \ + && make install dist_man_MANS= merge: | # tweak nofile limits RUN set -e \ From d424f2b7c81b7448729e06917e6d328d91910992 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 15 Jan 2024 09:36:22 +0000 Subject: [PATCH 109/209] empty commit so we can produce a merge commit From 7234208b36fea736ee1204c5a69a34db8160825e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 22 Jan 2024 09:14:30 +0000 Subject: [PATCH 110/209] bump shlex (#6421) ## Problem https://rustsec.org/advisories/RUSTSEC-2024-0006 ## Summary of changes `cargo update -p shlex` (cherry picked from commit 5559b169535b67850129173e694e5297a5a1a960) --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 75337481bb..952034a16b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5031,9 +5031,9 @@ dependencies = [ [[package]] name = "shlex" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43b2853a4d09f215c24cc5489c992ce46052d359b5109343cbafbf26bc62f8a3" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook" From 299d9474c9e3abc67297d81ad301ca46aaf97535 Mon Sep 17 00:00:00 2001 From: Anna Khanova 
<32508607+khanova@users.noreply.github.com> Date: Mon, 22 Jan 2024 14:24:10 +0100 Subject: [PATCH 111/209] Proxy: fix gc (#6426) ## Problem Gc currently doesn't work properly. ## Summary of changes Change statement on running gc. --- proxy/src/cache/project_info.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 7af2118873..57d9e5289d 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -266,7 +266,7 @@ impl ProjectInfoCacheImpl { tokio::time::interval(self.config.gc_interval / (self.cache.shards().len()) as u32); loop { interval.tick().await; - if self.cache.len() <= self.config.size { + if self.cache.len() < self.config.size { // If there are not too many entries, wait until the next gc cycle. continue; } From f0b2d4b0535fae693af1a7a18b2bb03e7ce243fb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 22 Jan 2024 15:27:29 +0100 Subject: [PATCH 112/209] fixup(#6037): actually fix the issue, #6388 failed to do so (#6429) Before this patch, the select! still retured immediately if `futs` was empty. Must have tested a stale build in my manual testing of #6388. (cherry picked from commit 15c0df4de7ee71b43526d9850b47c9107efe303e) --- pageserver/src/page_service.rs | 37 ++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1013131a99..77ce9981f0 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -386,12 +386,18 @@ impl PageServerHandler { /// Future that completes when we need to shut down the connection. /// - /// Reasons for need to shut down are: - /// - any of the timelines we hold GateGuards for in `shard_timelines` is cancelled - /// - task_mgr requests shutdown of the connection + /// We currently need to shut down when any of the following happens: + /// 1. any of the timelines we hold GateGuards for in `shard_timelines` is cancelled + /// 2. task_mgr requests shutdown of the connection /// - /// The need to check for `task_mgr` cancellation arises mainly from `handle_pagerequests` - /// where, at first, `shard_timelines` is empty, see + /// NB on (1): the connection's lifecycle is not actually tied to any of the + /// `shard_timelines`s' lifecycles. But it's _necessary_ in the current + /// implementation to be responsive to timeline cancellation because + /// the connection holds their `GateGuards` open (sored in `shard_timelines`). + /// We currently do the easy thing and terminate the connection if any of the + /// shard_timelines gets cancelled. But really, we cuold spend more effort + /// and simply remove the cancelled timeline from the `shard_timelines`, thereby + /// dropping the guard. /// /// NB: keep in sync with [`Self::is_connection_cancelled`] async fn await_connection_cancelled(&self) { @@ -404,16 +410,17 @@ impl PageServerHandler { // immutable &self). So it's fine to evaluate shard_timelines after the sleep, we don't risk // missing any inserts to the map. - let mut futs = self - .shard_timelines - .values() - .map(|ht| ht.timeline.cancel.cancelled()) - .collect::>(); - - tokio::select! 
{ - _ = task_mgr::shutdown_watcher() => { } - _ = futs.next() => {} - } + let mut cancellation_sources = Vec::with_capacity(1 + self.shard_timelines.len()); + use futures::future::Either; + cancellation_sources.push(Either::Left(task_mgr::shutdown_watcher())); + cancellation_sources.extend( + self.shard_timelines + .values() + .map(|ht| Either::Right(ht.timeline.cancel.cancelled())), + ); + FuturesUnordered::from_iter(cancellation_sources) + .next() + .await; } /// Checking variant of [`Self::await_connection_cancelled`]. From 90e689addae554d7f79899379d3b84ff414404da Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 22 Jan 2024 15:50:32 +0000 Subject: [PATCH 113/209] pageserver: mark tenant broken when cancelling attach (#6430) ## Problem When a tenant is in Attaching state, and waiting for the `concurrent_tenant_warmup` semaphore, it also listens for the tenant cancellation token. When that token fires, Tenant::attach drops out. Meanwhile, Tenant::set_stopping waits forever for the tenant to exit Attaching state. Fixes: https://github.com/neondatabase/neon/issues/6423 ## Summary of changes - In the absence of a valid state for the tenant, it is set to Broken in this path. A more elegant solution will require more refactoring, beyond this minimal fix. (cherry picked from commit 93572a3e99f572f51529b3fbb3b11dafa88f7f5c) --- pageserver/src/tenant.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ce99569beb..1d9b91c9ce 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -716,6 +716,10 @@ impl Tenant { // stayed in Activating for such a long time that shutdown found it in // that state. tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation"); + // Make the tenant broken so that set_stopping will not hang waiting for it to leave + // the Attaching state. This is an over-reaction (nothing really broke, the tenant is + // just shutting down), but ensures progress. + make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching")); return Ok(()); }, ) From d0cb4b88c8178a8373d6846c160ad32271435786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 5 Feb 2024 10:53:37 +0100 Subject: [PATCH 114/209] Don't preserve temp files on creation errors of delta layers (#6612) There is currently no cleanup done after a delta layer creation error, so delta layers can accumulate. The problem gets worse as the operation gets retried and delta layers accumulate on the disk. Therefore, delete them from disk (if something has been written to disk). --- pageserver/src/tenant/storage_layer/delta_layer.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ec031d6089..2a51884c0b 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -609,7 +609,19 @@ impl DeltaLayerWriter { key_end: Key, timeline: &Arc, ) -> anyhow::Result { - self.inner.take().unwrap().finish(key_end, timeline).await + let inner = self.inner.take().unwrap(); + let temp_path = inner.path.clone(); + let result = inner.finish(key_end, timeline).await; + // The delta layer files can sometimes be really large. Clean them up. 
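+        // Only the temporary file needs to be removed here: the failed layer was
+        // never handed back to the caller, so it should not be referenced anywhere else.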
+ if result.is_err() { + tracing::warn!( + "Cleaning up temporary delta file {temp_path} after error during writing" + ); + if let Err(e) = std::fs::remove_file(&temp_path) { + tracing::warn!("Error cleaning up temporary delta layer file {temp_path}: {e:?}") + } + } + result } } From 1cef395266bf7b439f1f14abe78d1ce8ef25d8ac Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 12 Feb 2024 14:04:46 +0100 Subject: [PATCH 115/209] Proxy: copy bidirectional fork (#6720) ## Problem `tokio::io::copy_bidirectional` doesn't close the connection once one of the sides closes it. It's not really suitable for the postgres protocol. ## Summary of changes Fork `copy_bidirectional` and initiate a shutdown for both connections. --------- Co-authored-by: Conrad Ludgate --- proxy/src/proxy.rs | 1 + proxy/src/proxy/copy_bidirectional.rs | 256 ++++++++++++++++++++++++++ proxy/src/proxy/passthrough.rs | 2 +- 3 files changed, 258 insertions(+), 1 deletion(-) create mode 100644 proxy/src/proxy/copy_bidirectional.rs diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 50e22ec72a..77aadb6f28 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -2,6 +2,7 @@ mod tests; pub mod connect_compute; +mod copy_bidirectional; pub mod handshake; pub mod passthrough; pub mod retry; diff --git a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs new file mode 100644 index 0000000000..2ecc1151da --- /dev/null +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -0,0 +1,256 @@ +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +use std::future::poll_fn; +use std::io; +use std::pin::Pin; +use std::task::{ready, Context, Poll}; + +#[derive(Debug)] +enum TransferState { + Running(CopyBuffer), + ShuttingDown(u64), + Done(u64), +} + +fn transfer_one_direction( + cx: &mut Context<'_>, + state: &mut TransferState, + r: &mut A, + w: &mut B, +) -> Poll> +where + A: AsyncRead + AsyncWrite + Unpin + ?Sized, + B: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut r = Pin::new(r); + let mut w = Pin::new(w); + loop { + match state { + TransferState::Running(buf) => { + let count = ready!(buf.poll_copy(cx, r.as_mut(), w.as_mut()))?; + *state = TransferState::ShuttingDown(count); + } + TransferState::ShuttingDown(count) => { + ready!(w.as_mut().poll_shutdown(cx))?; + *state = TransferState::Done(*count); + } + TransferState::Done(count) => return Poll::Ready(Ok(*count)), + } + } +} + +pub(super) async fn copy_bidirectional( + a: &mut A, + b: &mut B, +) -> Result<(u64, u64), std::io::Error> +where + A: AsyncRead + AsyncWrite + Unpin + ?Sized, + B: AsyncRead + AsyncWrite + Unpin + ?Sized, +{ + let mut a_to_b = TransferState::Running(CopyBuffer::new()); + let mut b_to_a = TransferState::Running(CopyBuffer::new()); + + poll_fn(|cx| { + let mut a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?; + let mut b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?; + + // Early termination checks + if let TransferState::Done(_) = a_to_b { + if let TransferState::Running(buf) = &b_to_a { + // Initiate shutdown + b_to_a = TransferState::ShuttingDown(buf.amt); + b_to_a_result = transfer_one_direction(cx, &mut b_to_a, b, a)?; + } + } + if let TransferState::Done(_) = b_to_a { + if let TransferState::Running(buf) = &a_to_b { + // Initiate shutdown + a_to_b = TransferState::ShuttingDown(buf.amt); + a_to_b_result = transfer_one_direction(cx, &mut a_to_b, a, b)?; + } + } + + // It is not a problem if ready! returns early ... 
(comment remains the same) + let a_to_b = ready!(a_to_b_result); + let b_to_a = ready!(b_to_a_result); + + Poll::Ready(Ok((a_to_b, b_to_a))) + }) + .await +} + +#[derive(Debug)] +pub(super) struct CopyBuffer { + read_done: bool, + need_flush: bool, + pos: usize, + cap: usize, + amt: u64, + buf: Box<[u8]>, +} +const DEFAULT_BUF_SIZE: usize = 8 * 1024; + +impl CopyBuffer { + pub(super) fn new() -> Self { + Self { + read_done: false, + need_flush: false, + pos: 0, + cap: 0, + amt: 0, + buf: vec![0; DEFAULT_BUF_SIZE].into_boxed_slice(), + } + } + + fn poll_fill_buf( + &mut self, + cx: &mut Context<'_>, + reader: Pin<&mut R>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + { + let me = &mut *self; + let mut buf = ReadBuf::new(&mut me.buf); + buf.set_filled(me.cap); + + let res = reader.poll_read(cx, &mut buf); + if let Poll::Ready(Ok(())) = res { + let filled_len = buf.filled().len(); + me.read_done = me.cap == filled_len; + me.cap = filled_len; + } + res + } + + fn poll_write_buf( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + let me = &mut *self; + match writer.as_mut().poll_write(cx, &me.buf[me.pos..me.cap]) { + Poll::Pending => { + // Top up the buffer towards full if we can read a bit more + // data - this should improve the chances of a large write + if !me.read_done && me.cap < me.buf.len() { + ready!(me.poll_fill_buf(cx, reader.as_mut()))?; + } + Poll::Pending + } + res => res, + } + } + + pub(super) fn poll_copy( + &mut self, + cx: &mut Context<'_>, + mut reader: Pin<&mut R>, + mut writer: Pin<&mut W>, + ) -> Poll> + where + R: AsyncRead + ?Sized, + W: AsyncWrite + ?Sized, + { + loop { + // If our buffer is empty, then we need to read some data to + // continue. + if self.pos == self.cap && !self.read_done { + self.pos = 0; + self.cap = 0; + + match self.poll_fill_buf(cx, reader.as_mut()) { + Poll::Ready(Ok(())) => (), + Poll::Ready(Err(err)) => return Poll::Ready(Err(err)), + Poll::Pending => { + // Try flushing when the reader has no progress to avoid deadlock + // when the reader depends on buffered writer. + if self.need_flush { + ready!(writer.as_mut().poll_flush(cx))?; + self.need_flush = false; + } + + return Poll::Pending; + } + } + } + + // If our buffer has some data, let's write it out! + while self.pos < self.cap { + let i = ready!(self.poll_write_buf(cx, reader.as_mut(), writer.as_mut()))?; + if i == 0 { + return Poll::Ready(Err(io::Error::new( + io::ErrorKind::WriteZero, + "write zero byte into writer", + ))); + } else { + self.pos += i; + self.amt += i as u64; + self.need_flush = true; + } + } + + // If pos larger than cap, this loop will never stop. + // In particular, user's wrong poll_write implementation returning + // incorrect written length may lead to thread blocking. + debug_assert!( + self.pos <= self.cap, + "writer returned length larger than input slice" + ); + + // If we've written all the data and we've seen EOF, flush out the + // data and finish the transfer. 
+ if self.pos == self.cap && self.read_done { + ready!(writer.as_mut().poll_flush(cx))?; + return Poll::Ready(Ok(self.amt)); + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tokio::io::AsyncWriteExt; + + #[tokio::test] + async fn test_early_termination_a_to_d() { + let (mut a_mock, mut b_mock) = tokio::io::duplex(8); // Create a mock duplex stream + let (mut c_mock, mut d_mock) = tokio::io::duplex(32); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + a_mock.write_all(b"hello").await.unwrap(); + a_mock.shutdown().await.unwrap(); + d_mock.write_all(b"Neon Serverless Postgres").await.unwrap(); + + let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap(); + + // Assert correct transferred amounts + let (a_to_d_count, d_to_a_count) = result; + assert_eq!(a_to_d_count, 5); // 'hello' was transferred + assert!(d_to_a_count <= 8); // response only partially transferred or not at all + } + + #[tokio::test] + async fn test_early_termination_d_to_a() { + let (mut a_mock, mut b_mock) = tokio::io::duplex(32); // Create a mock duplex stream + let (mut c_mock, mut d_mock) = tokio::io::duplex(8); // Create a mock duplex stream + + // Simulate 'a' finishing while there's still data for 'b' + d_mock.write_all(b"hello").await.unwrap(); + d_mock.shutdown().await.unwrap(); + a_mock.write_all(b"Neon Serverless Postgres").await.unwrap(); + + let result = copy_bidirectional(&mut b_mock, &mut c_mock).await.unwrap(); + + // Assert correct transferred amounts + let (a_to_d_count, d_to_a_count) = result; + assert_eq!(d_to_a_count, 5); // 'hello' was transferred + assert!(a_to_d_count <= 8); // response only partially transferred or not at all + } +} diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index b7018c6fb5..c98f68d8d1 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -45,7 +45,7 @@ pub async fn proxy_pass( // Starting from here we only proxy the client's traffic. info!("performing the proxy pass..."); - let _ = tokio::io::copy_bidirectional(&mut client, &mut compute).await?; + let _ = crate::proxy::copy_bidirectional::copy_bidirectional(&mut client, &mut compute).await?; Ok(()) } From 30b295b017e086d1478e16f6d8cff878fe66171c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 12 Feb 2024 13:14:06 +0000 Subject: [PATCH 116/209] proxy: some more parquet data (#6711) ## Summary of changes add auth_method and database to the parquet logs --- proxy/src/auth/backend.rs | 8 ++-- proxy/src/auth/backend/classic.rs | 8 ++-- proxy/src/auth/backend/hacks.rs | 12 +++-- proxy/src/auth/backend/link.rs | 2 + proxy/src/auth/credentials.rs | 3 ++ proxy/src/auth/flow.rs | 17 ++++++- proxy/src/context.rs | 23 ++++++++- proxy/src/context/parquet.rs | 69 ++++++++++++++++----------- proxy/src/proxy/tests.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 9 +++- 10 files changed, 104 insertions(+), 49 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index fa2782bee3..c9f21f1cf5 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -194,8 +194,7 @@ async fn auth_quirks( // We now expect to see a very specific payload in the place of password. 
let (info, unauthenticated_password) = match user_info.try_into() { Err(info) => { - let res = hacks::password_hack_no_authentication(info, client, &mut ctx.latency_timer) - .await?; + let res = hacks::password_hack_no_authentication(ctx, info, client).await?; ctx.set_endpoint_id(res.info.endpoint.clone()); tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint)); @@ -276,11 +275,12 @@ async fn authenticate_with_secret( // Perform cleartext auth if we're allowed to do that. // Currently, we use it for websocket connections (latency). if allow_cleartext { - return hacks::authenticate_cleartext(info, client, &mut ctx.latency_timer, secret).await; + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + return hacks::authenticate_cleartext(ctx, info, client, secret).await; } // Finally, proceed with the main auth flow (SCRAM-based). - classic::authenticate(info, client, config, &mut ctx.latency_timer, secret).await + classic::authenticate(ctx, info, client, config, secret).await } impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 745dd75107..e855843bc3 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -4,7 +4,7 @@ use crate::{ compute, config::AuthenticationConfig, console::AuthSecret, - metrics::LatencyTimer, + context::RequestMonitoring, sasl, stream::{PqStream, Stream}, }; @@ -12,10 +12,10 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; pub(super) async fn authenticate( + ctx: &mut RequestMonitoring, creds: ComputeUserInfo, client: &mut PqStream>, config: &'static AuthenticationConfig, - latency_timer: &mut LatencyTimer, secret: AuthSecret, ) -> auth::Result> { let flow = AuthFlow::new(client); @@ -27,13 +27,11 @@ pub(super) async fn authenticate( } AuthSecret::Scram(secret) => { info!("auth endpoint chooses SCRAM"); - let scram = auth::Scram(&secret); + let scram = auth::Scram(&secret, &mut *ctx); let auth_outcome = tokio::time::timeout( config.scram_protocol_timeout, async { - // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); flow.begin(scram).await.map_err(|error| { warn!(?error, "error sending scram acknowledgement"); diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index b6c1a92d3c..9f60b709d4 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -4,7 +4,7 @@ use super::{ use crate::{ auth::{self, AuthFlow}, console::AuthSecret, - metrics::LatencyTimer, + context::RequestMonitoring, sasl, stream::{self, Stream}, }; @@ -16,15 +16,16 @@ use tracing::{info, warn}; /// These properties are benefical for serverless JS workers, so we /// use this mechanism for websocket connections. 
pub async fn authenticate_cleartext( + ctx: &mut RequestMonitoring, info: ComputeUserInfo, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, secret: AuthSecret, ) -> auth::Result> { warn!("cleartext auth flow override is enabled, proceeding"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let _paused = ctx.latency_timer.pause(); let auth_outcome = AuthFlow::new(client) .begin(auth::CleartextPassword(secret)) @@ -47,14 +48,15 @@ pub async fn authenticate_cleartext( /// Similar to [`authenticate_cleartext`], but there's a specific password format, /// and passwords are not yet validated (we don't know how to validate them!) pub async fn password_hack_no_authentication( + ctx: &mut RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, - latency_timer: &mut LatencyTimer, ) -> auth::Result>> { warn!("project not specified, resorting to the password hack auth flow"); + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client - let _paused = latency_timer.pause(); + let _paused = ctx.latency_timer.pause(); let payload = AuthFlow::new(client) .begin(auth::PasswordHack) diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs index c71637dd1a..bf9ebf4c18 100644 --- a/proxy/src/auth/backend/link.rs +++ b/proxy/src/auth/backend/link.rs @@ -61,6 +61,8 @@ pub(super) async fn authenticate( link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { + ctx.set_auth_method(crate::context::AuthMethod::Web); + // registering waiter can fail if we get unlucky with rng. // just try again. let (psql_session_id, waiter) = loop { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index d32609e44c..d318b3be54 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -99,6 +99,9 @@ impl ComputeUserInfoMaybeEndpoint { // record the values if we have them ctx.set_application(params.get("application_name").map(SmolStr::from)); ctx.set_user(user.clone()); + if let Some(dbname) = params.get("database") { + ctx.set_dbname(dbname.into()); + } // Project name might be passed via PG's command-line options. let endpoint_option = params diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index c2783e236c..dce73138c6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -4,9 +4,11 @@ use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; use crate::{ config::TlsServerEndPoint, console::AuthSecret, + context::RequestMonitoring, sasl, scram, stream::{PqStream, Stream}, }; +use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; use std::io; use tokio::io::{AsyncRead, AsyncWrite}; @@ -23,7 +25,7 @@ pub trait AuthMethod { pub struct Begin; /// Use [SCRAM](crate::scram)-based auth in [`AuthFlow`]. -pub struct Scram<'a>(pub &'a scram::ServerSecret); +pub struct Scram<'a>(pub &'a scram::ServerSecret, pub &'a mut RequestMonitoring); impl AuthMethod for Scram<'_> { #[inline(always)] @@ -138,6 +140,11 @@ impl AuthFlow<'_, S, CleartextPassword> { impl AuthFlow<'_, S, Scram<'_>> { /// Perform user authentication. Raise an error in case authentication failed. 
pub async fn authenticate(self) -> super::Result> { + let Scram(secret, ctx) = self.state; + + // pause the timer while we communicate with the client + let _paused = ctx.latency_timer.pause(); + // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) @@ -148,9 +155,15 @@ impl AuthFlow<'_, S, Scram<'_>> { return Err(super::AuthError::bad_auth_method(sasl.method)); } + match sasl.method { + SCRAM_SHA_256 => ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256), + SCRAM_SHA_256_PLUS => { + ctx.auth_method = Some(crate::context::AuthMethod::ScramSha256Plus) + } + _ => {} + } info!("client chooses {}", sasl.method); - let secret = self.state.0; let outcome = sasl::SaslStream::new(self.stream, sasl.message) .authenticate(scram::Exchange::new( secret, diff --git a/proxy/src/context.rs b/proxy/src/context.rs index d2bf3f68d3..0cea53ae63 100644 --- a/proxy/src/context.rs +++ b/proxy/src/context.rs @@ -11,7 +11,7 @@ use crate::{ console::messages::MetricsAuxInfo, error::ErrorKind, metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, - BranchId, EndpointId, ProjectId, RoleName, + BranchId, DbName, EndpointId, ProjectId, RoleName, }; pub mod parquet; @@ -34,9 +34,11 @@ pub struct RequestMonitoring { project: Option, branch: Option, endpoint_id: Option, + dbname: Option, user: Option, application: Option, error_kind: Option, + pub(crate) auth_method: Option, success: bool, // extra @@ -45,6 +47,15 @@ pub struct RequestMonitoring { pub latency_timer: LatencyTimer, } +#[derive(Clone, Debug)] +pub enum AuthMethod { + // aka link aka passwordless + Web, + ScramSha256, + ScramSha256Plus, + Cleartext, +} + impl RequestMonitoring { pub fn new( session_id: Uuid, @@ -62,9 +73,11 @@ impl RequestMonitoring { project: None, branch: None, endpoint_id: None, + dbname: None, user: None, application: None, error_kind: None, + auth_method: None, success: false, sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), @@ -106,10 +119,18 @@ impl RequestMonitoring { self.application = app.or_else(|| self.application.clone()); } + pub fn set_dbname(&mut self, dbname: DbName) { + self.dbname = Some(dbname); + } + pub fn set_user(&mut self, user: RoleName) { self.user = Some(user); } + pub fn set_auth_method(&mut self, auth_method: AuthMethod) { + self.auth_method = Some(auth_method); + } + pub fn set_error_kind(&mut self, kind: ErrorKind) { ERROR_BY_KIND .with_label_values(&[kind.to_metric_label()]) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 0fe46915bc..ad22829183 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -84,8 +84,10 @@ struct RequestData { username: Option, application_name: Option, endpoint_id: Option, + database: Option, project: Option, branch: Option, + auth_method: Option<&'static str>, error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside /// Or if we make it to proxy_pass @@ -104,8 +106,15 @@ impl From for RequestData { username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), endpoint_id: value.endpoint_id.as_deref().map(String::from), + database: value.dbname.as_deref().map(String::from), project: value.project.as_deref().map(String::from), branch: value.branch.as_deref().map(String::from), + auth_method: value.auth_method.as_ref().map(|x| match x { + super::AuthMethod::Web => "web", + 
super::AuthMethod::ScramSha256 => "scram_sha_256", + super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", + super::AuthMethod::Cleartext => "cleartext", + }), protocol: value.protocol, region: value.region, error: value.error_kind.as_ref().map(|e| e.to_metric_label()), @@ -431,8 +440,10 @@ mod tests { application_name: Some("test".to_owned()), username: Some(hex::encode(rng.gen::<[u8; 4]>())), endpoint_id: Some(hex::encode(rng.gen::<[u8; 16]>())), + database: Some(hex::encode(rng.gen::<[u8; 16]>())), project: Some(hex::encode(rng.gen::<[u8; 16]>())), branch: Some(hex::encode(rng.gen::<[u8; 16]>())), + auth_method: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, @@ -505,15 +516,15 @@ mod tests { assert_eq!( file_stats, [ - (1087635, 3, 6000), - (1087288, 3, 6000), - (1087444, 3, 6000), - (1087572, 3, 6000), - (1087468, 3, 6000), - (1087500, 3, 6000), - (1087533, 3, 6000), - (1087566, 3, 6000), - (362671, 1, 2000) + (1313727, 3, 6000), + (1313720, 3, 6000), + (1313780, 3, 6000), + (1313737, 3, 6000), + (1313867, 3, 6000), + (1313709, 3, 6000), + (1313501, 3, 6000), + (1313737, 3, 6000), + (438118, 1, 2000) ], ); @@ -543,11 +554,11 @@ mod tests { assert_eq!( file_stats, [ - (1028637, 5, 10000), - (1031969, 5, 10000), - (1019900, 5, 10000), - (1020365, 5, 10000), - (1025010, 5, 10000) + (1219459, 5, 10000), + (1225609, 5, 10000), + (1227403, 5, 10000), + (1226765, 5, 10000), + (1218043, 5, 10000) ], ); @@ -579,11 +590,11 @@ mod tests { assert_eq!( file_stats, [ - (1210770, 6, 12000), - (1211036, 6, 12000), - (1210990, 6, 12000), - (1210861, 6, 12000), - (202073, 1, 2000) + (1205106, 5, 10000), + (1204837, 5, 10000), + (1205130, 5, 10000), + (1205118, 5, 10000), + (1205373, 5, 10000) ], ); @@ -608,15 +619,15 @@ mod tests { assert_eq!( file_stats, [ - (1087635, 3, 6000), - (1087288, 3, 6000), - (1087444, 3, 6000), - (1087572, 3, 6000), - (1087468, 3, 6000), - (1087500, 3, 6000), - (1087533, 3, 6000), - (1087566, 3, 6000), - (362671, 1, 2000) + (1313727, 3, 6000), + (1313720, 3, 6000), + (1313780, 3, 6000), + (1313737, 3, 6000), + (1313867, 3, 6000), + (1313709, 3, 6000), + (1313501, 3, 6000), + (1313737, 3, 6000), + (438118, 1, 2000) ], ); @@ -653,7 +664,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(545264, 2, 3001), (545025, 2, 3000), (544857, 2, 2999)], + [(658383, 2, 3001), (658097, 2, 3000), (657893, 2, 2999)], ); tmpdir.close().unwrap(); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 3e961afb41..5bb43c0375 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -144,7 +144,7 @@ impl TestAuth for Scram { stream: &mut PqStream>, ) -> anyhow::Result<()> { let outcome = auth::AuthFlow::new(stream) - .begin(auth::Scram(&self.0)) + .begin(auth::Scram(&self.0, &mut RequestMonitoring::test())) .await? 
.authenticate() .await?; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 54424360c4..e9f868d51e 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -36,6 +36,7 @@ use crate::error::ReportableError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; +use crate::DbName; use crate::RoleName; use super::backend::PoolingBackend; @@ -117,6 +118,9 @@ fn get_conn_info( headers: &HeaderMap, tls: &TlsConfig, ) -> Result { + // HTTP only uses cleartext (for now and likely always) + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + let connection_string = headers .get("Neon-Connection-String") .ok_or(ConnInfoError::InvalidHeader("Neon-Connection-String"))? @@ -134,7 +138,8 @@ fn get_conn_info( .path_segments() .ok_or(ConnInfoError::MissingDbName)?; - let dbname = url_path.next().ok_or(ConnInfoError::InvalidDbName)?; + let dbname: DbName = url_path.next().ok_or(ConnInfoError::InvalidDbName)?.into(); + ctx.set_dbname(dbname.clone()); let username = RoleName::from(urlencoding::decode(connection_url.username())?); if username.is_empty() { @@ -174,7 +179,7 @@ fn get_conn_info( Ok(ConnInfo { user_info, - dbname: dbname.into(), + dbname, password: match password { std::borrow::Cow::Borrowed(b) => b.into(), std::borrow::Cow::Owned(b) => b.into(), From f1347f24173bd61c2c4469fca2336c52b0980746 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 12 Feb 2024 15:03:45 +0000 Subject: [PATCH 117/209] proxy: add more http logging (#6726) ## Problem hard to see where time is taken during HTTP flow. ## Summary of changes add a lot more for query state. add a conn_id field to the sql-over-http span --- proxy/src/metrics.rs | 5 ++-- proxy/src/serverless/backend.rs | 8 +++---- proxy/src/serverless/conn_pool.rs | 22 +++++------------- proxy/src/serverless/sql_over_http.rs | 33 +++++++++++++++++++++++---- 4 files changed, 41 insertions(+), 27 deletions(-) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index ccf89f9b05..f7f162a075 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -200,8 +200,9 @@ impl LatencyTimer { pub fn success(&mut self) { // stop the stopwatch and record the time that we have accumulated - let start = self.start.take().expect("latency timer should be started"); - self.accumulated += start.elapsed(); + if let Some(start) = self.start.take() { + self.accumulated += start.elapsed(); + } // success self.outcome = "success"; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 8285da68d7..156002006d 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,7 +1,7 @@ use std::{sync::Arc, time::Duration}; use async_trait::async_trait; -use tracing::info; +use tracing::{field::display, info}; use crate::{ auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError}, @@ -15,7 +15,7 @@ use crate::{ proxy::connect_compute::ConnectMechanism, }; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool, APP_NAME}; +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; pub struct PoolingBackend { pub pool: Arc>, @@ -81,8 +81,8 @@ impl PoolingBackend { return Ok(client); } let conn_id = uuid::Uuid::new_v4(); - info!(%conn_id, "pool: opening a new connection '{conn_info}'"); - ctx.set_application(Some(APP_NAME)); + tracing::Span::current().record("conn_id", display(conn_id)); + info!("pool: opening a new 
connection '{conn_info}'"); let backend = self .config .auth_backend diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index f4e5b145c5..53e7c1c2ee 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -4,7 +4,6 @@ use metrics::IntCounterPairGuard; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; -use smol_str::SmolStr; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use std::{ fmt, @@ -31,8 +30,6 @@ use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; -pub const APP_NAME: SmolStr = SmolStr::new_inline("/sql_over_http"); - #[derive(Debug, Clone)] pub struct ConnInfo { pub user_info: ComputeUserInfo, @@ -379,12 +376,13 @@ impl GlobalConnPool { info!("pool: cached connection '{conn_info}' is closed, opening a new one"); return Ok(None); } else { - info!("pool: reusing connection '{conn_info}'"); - client.session.send(ctx.session_id)?; + tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); tracing::Span::current().record( "pid", &tracing::field::display(client.inner.get_process_id()), ); + info!("pool: reusing connection '{conn_info}'"); + client.session.send(ctx.session_id)?; ctx.latency_timer.pool_hit(); ctx.latency_timer.success(); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); @@ -577,7 +575,6 @@ pub struct Client { } pub struct Discard<'a, C: ClientInnerExt> { - conn_id: uuid::Uuid, conn_info: &'a ConnInfo, pool: &'a mut Weak>>, } @@ -603,14 +600,7 @@ impl Client { span: _, } = self; let inner = inner.as_mut().expect("client inner should not be removed"); - ( - &mut inner.inner, - Discard { - pool, - conn_info, - conn_id: inner.conn_id, - }, - ) + (&mut inner.inner, Discard { pool, conn_info }) } pub fn check_idle(&mut self, status: ReadyForQueryStatus) { @@ -625,13 +615,13 @@ impl Discard<'_, C> { pub fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is not idle") + info!("pool: throwing away connection '{conn_info}' because connection is not idle") } } pub fn discard(&mut self) { let conn_info = &self.conn_info; if std::mem::take(self.pool).strong_count() > 0 { - info!(conn_id = %self.conn_id, "pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state") } } } diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index e9f868d51e..ecb72abe73 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -36,6 +36,7 @@ use crate::error::ReportableError; use crate::metrics::HTTP_CONTENT_LENGTH; use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE; use crate::proxy::NeonOptions; +use crate::serverless::backend::HttpConnError; use crate::DbName; use crate::RoleName; @@ -305,7 +306,14 @@ pub async fn handle( Ok(response) } -#[instrument(name = "sql-over-http", fields(pid = tracing::field::Empty), skip_all)] +#[instrument( + name = "sql-over-http", + skip_all, + fields( + pid = tracing::field::Empty, + conn_id = tracing::field::Empty + ) +)] async fn handle_inner( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, @@ -359,12 +367,10 @@ async fn 
handle_inner( let txn_read_only = headers.get(&TXN_READ_ONLY) == Some(&HEADER_VALUE_TRUE); let txn_deferrable = headers.get(&TXN_DEFERRABLE) == Some(&HEADER_VALUE_TRUE); - let paused = ctx.latency_timer.pause(); let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, }; - drop(paused); info!(request_content_length, "request size in bytes"); HTTP_CONTENT_LENGTH.observe(request_content_length as f64); @@ -380,15 +386,20 @@ async fn handle_inner( let body = hyper::body::to_bytes(request.into_body()) .await .map_err(anyhow::Error::from)?; + info!(length = body.len(), "request payload read"); let payload: Payload = serde_json::from_slice(&body)?; Ok::(payload) // Adjust error type accordingly }; let authenticate_and_connect = async { let keys = backend.authenticate(ctx, &conn_info).await?; - backend + let client = backend .connect_to_compute(ctx, conn_info, keys, !allow_pool) - .await + .await?; + // not strictly necessary to mark success here, + // but it's just insurance for if we forget it somewhere else + ctx.latency_timer.success(); + Ok::<_, HttpConnError>(client) }; // Run both operations in parallel @@ -420,6 +431,7 @@ async fn handle_inner( results } Payload::Batch(statements) => { + info!("starting transaction"); let (inner, mut discard) = client.inner(); let mut builder = inner.build_transaction(); if let Some(isolation_level) = txn_isolation_level { @@ -449,6 +461,7 @@ async fn handle_inner( .await { Ok(results) => { + info!("commit"); let status = transaction.commit().await.map_err(|e| { // if we cannot commit - for now don't return connection to pool // TODO: get a query status from the error @@ -459,6 +472,7 @@ async fn handle_inner( results } Err(err) => { + info!("rollback"); let status = transaction.rollback().await.map_err(|e| { // if we cannot rollback - for now don't return connection to pool // TODO: get a query status from the error @@ -533,8 +547,10 @@ async fn query_to_json( raw_output: bool, default_array_mode: bool, ) -> anyhow::Result<(ReadyForQueryStatus, Value)> { + info!("executing query"); let query_params = data.params; let row_stream = client.query_raw_txt(&data.query, query_params).await?; + info!("finished executing query"); // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too @@ -569,6 +585,13 @@ async fn query_to_json( } .and_then(|s| s.parse::().ok()); + info!( + rows = rows.len(), + ?ready, + command_tag, + "finished reading rows" + ); + let mut fields = vec![]; let mut columns = vec![]; From 2227540a0d330dbafb885af83a6487903ae4f68e Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Mon, 12 Feb 2024 19:41:02 +0100 Subject: [PATCH 118/209] Proxy refactor auth+connect (#6708) ## Problem Not really a problem, just refactoring. ## Summary of changes Separate authenticate from wake compute. Do not call wake compute second time if we managed to connect to postgres or if we got it not from cache. 
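
A minimal, self-contained sketch of the new retry decision (the names below are
simplified stand-ins for the proxy's `ErrorKind`/`NodeInfo`, not the actual API):

```rust
// Wake the compute once up front; after a failed connection attempt, only
// invalidate the cached node info and wake again when the cache could be stale.
#[derive(Clone, Copy, PartialEq)]
enum ErrorKind {
    Postgres, // we reached the compute, but Postgres itself reported an error
    Compute,  // we could not reach the compute at all
}

struct NodeInfo {
    cached: bool, // did this info come from the cache, or straight from the console?
}

fn should_rewake(err: ErrorKind, node_info: &NodeInfo) -> bool {
    // A Postgres-level error means the address was fine, and freshly fetched
    // info cannot be stale; re-waking in those cases would only add latency.
    err != ErrorKind::Postgres && node_info.cached
}

fn main() {
    assert!(should_rewake(ErrorKind::Compute, &NodeInfo { cached: true }));
    assert!(!should_rewake(ErrorKind::Compute, &NodeInfo { cached: false }));
    assert!(!should_rewake(ErrorKind::Postgres, &NodeInfo { cached: true }));
}
```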
--- proxy/src/auth.rs | 5 - proxy/src/auth/backend.rs | 146 ++++++++++++++++------------- proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/backend/hacks.rs | 6 +- proxy/src/bin/proxy.rs | 2 +- proxy/src/compute.rs | 8 +- proxy/src/config.rs | 2 +- proxy/src/console/provider.rs | 33 ++++++- proxy/src/console/provider/mock.rs | 4 +- proxy/src/error.rs | 12 ++- proxy/src/proxy.rs | 13 +-- proxy/src/proxy/connect_compute.rs | 67 ++++++++----- proxy/src/proxy/tests.rs | 142 +++++++++++++++++++++------- proxy/src/proxy/wake_compute.rs | 16 +--- proxy/src/serverless/backend.rs | 40 +++----- 15 files changed, 307 insertions(+), 191 deletions(-) diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 48de4e2353..c8028d1bf0 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -36,9 +36,6 @@ pub enum AuthErrorImpl { #[error(transparent)] GetAuthInfo(#[from] console::errors::GetAuthInfoError), - #[error(transparent)] - WakeCompute(#[from] console::errors::WakeComputeError), - /// SASL protocol errors (includes [SCRAM](crate::scram)). #[error(transparent)] Sasl(#[from] crate::sasl::Error), @@ -119,7 +116,6 @@ impl UserFacingError for AuthError { match self.0.as_ref() { Link(e) => e.to_string_client(), GetAuthInfo(e) => e.to_string_client(), - WakeCompute(e) => e.to_string_client(), Sasl(e) => e.to_string_client(), AuthFailed(_) => self.to_string(), BadAuthMethod(_) => self.to_string(), @@ -139,7 +135,6 @@ impl ReportableError for AuthError { match self.0.as_ref() { Link(e) => e.get_error_kind(), GetAuthInfo(e) => e.get_error_kind(), - WakeCompute(e) => e.get_error_kind(), Sasl(e) => e.get_error_kind(), AuthFailed(_) => crate::error::ErrorKind::User, BadAuthMethod(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index c9f21f1cf5..47c1dc4e92 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -10,9 +10,9 @@ use crate::auth::validate_password_and_exchange; use crate::cache::Cached; use crate::console::errors::GetAuthInfoError; use crate::console::provider::{CachedRoleSecret, ConsoleBackend}; -use crate::console::AuthSecret; +use crate::console::{AuthSecret, NodeInfo}; use crate::context::RequestMonitoring; -use crate::proxy::wake_compute::wake_compute; +use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::stream::Stream; use crate::{ @@ -26,7 +26,6 @@ use crate::{ stream, url, }; use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; -use futures::TryFutureExt; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; @@ -56,11 +55,11 @@ impl std::ops::Deref for MaybeOwned<'_, T> { /// * However, when we substitute `T` with [`ComputeUserInfoMaybeEndpoint`], /// this helps us provide the credentials only to those auth /// backends which require them for the authentication process. -pub enum BackendType<'a, T> { +pub enum BackendType<'a, T, D> { /// Cloud API (V2). Console(MaybeOwned<'a, ConsoleBackend>, T), /// Authentication via a web browser. 
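    /// (The `D` slot lets this variant carry data produced during authentication,
    ///  e.g. the `NodeInfo` returned by the link flow.)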
- Link(MaybeOwned<'a, url::ApiUrl>), + Link(MaybeOwned<'a, url::ApiUrl>, D), } pub trait TestBackend: Send + Sync + 'static { @@ -71,7 +70,7 @@ pub trait TestBackend: Send + Sync + 'static { fn get_role_secret(&self) -> Result; } -impl std::fmt::Display for BackendType<'_, ()> { +impl std::fmt::Display for BackendType<'_, (), ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { use BackendType::*; match self { @@ -86,51 +85,50 @@ impl std::fmt::Display for BackendType<'_, ()> { #[cfg(test)] ConsoleBackend::Test(_) => fmt.debug_tuple("Test").finish(), }, - Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Link(url, _) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), } } } -impl BackendType<'_, T> { +impl BackendType<'_, T, D> { /// Very similar to [`std::option::Option::as_ref`]. /// This helps us pass structured config to async tasks. - pub fn as_ref(&self) -> BackendType<'_, &T> { + pub fn as_ref(&self) -> BackendType<'_, &T, &D> { use BackendType::*; match self { Console(c, x) => Console(MaybeOwned::Borrowed(c), x), - Link(c) => Link(MaybeOwned::Borrowed(c)), + Link(c, x) => Link(MaybeOwned::Borrowed(c), x), } } } -impl<'a, T> BackendType<'a, T> { +impl<'a, T, D> BackendType<'a, T, D> { /// Very similar to [`std::option::Option::map`]. /// Maps [`BackendType`] to [`BackendType`] by applying /// a function to a contained value. - pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R> { + pub fn map(self, f: impl FnOnce(T) -> R) -> BackendType<'a, R, D> { use BackendType::*; match self { Console(c, x) => Console(c, f(x)), - Link(c) => Link(c), + Link(c, x) => Link(c, x), } } } - -impl<'a, T, E> BackendType<'a, Result> { +impl<'a, T, D, E> BackendType<'a, Result, D> { /// Very similar to [`std::option::Option::transpose`]. /// This is most useful for error handling. - pub fn transpose(self) -> Result, E> { + pub fn transpose(self) -> Result, E> { use BackendType::*; match self { Console(c, x) => x.map(|x| Console(c, x)), - Link(c) => Ok(Link(c)), + Link(c, x) => Ok(Link(c, x)), } } } -pub struct ComputeCredentials { +pub struct ComputeCredentials { pub info: ComputeUserInfo, - pub keys: T, + pub keys: ComputeCredentialKeys, } #[derive(Debug, Clone)] @@ -153,7 +151,6 @@ impl ComputeUserInfo { } pub enum ComputeCredentialKeys { - #[cfg(any(test, feature = "testing"))] Password(Vec), AuthKeys(AuthKeys), } @@ -188,7 +185,7 @@ async fn auth_quirks( client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { +) -> auth::Result { // If there's no project so far, that entails that client doesn't // support SNI or other means of passing the endpoint (project) name. // We now expect to see a very specific payload in the place of password. 
@@ -198,8 +195,11 @@ async fn auth_quirks( ctx.set_endpoint_id(res.info.endpoint.clone()); tracing::Span::current().record("ep", &tracing::field::display(&res.info.endpoint)); - - (res.info, Some(res.keys)) + let password = match res.keys { + ComputeCredentialKeys::Password(p) => p, + _ => unreachable!("password hack should return a password"), + }; + (res.info, Some(password)) } Ok(info) => (info, None), }; @@ -253,7 +253,7 @@ async fn authenticate_with_secret( unauthenticated_password: Option>, allow_cleartext: bool, config: &'static AuthenticationConfig, -) -> auth::Result> { +) -> auth::Result { if let Some(password) = unauthenticated_password { let auth_outcome = validate_password_and_exchange(&password, secret)?; let keys = match auth_outcome { @@ -283,14 +283,14 @@ async fn authenticate_with_secret( classic::authenticate(ctx, info, client, config, secret).await } -impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { +impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint, &()> { /// Get compute endpoint name from the credentials. pub fn get_endpoint(&self) -> Option { use BackendType::*; match self { Console(_, user_info) => user_info.endpoint_id.clone(), - Link(_) => Some("link".into()), + Link(_, _) => Some("link".into()), } } @@ -300,7 +300,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { match self { Console(_, user_info) => &user_info.user, - Link(_) => "link", + Link(_, _) => "link", } } @@ -312,7 +312,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { client: &mut stream::PqStream>, allow_cleartext: bool, config: &'static AuthenticationConfig, - ) -> auth::Result<(CachedNodeInfo, BackendType<'a, ComputeUserInfo>)> { + ) -> auth::Result> { use BackendType::*; let res = match self { @@ -323,33 +323,17 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { "performing authentication using the console" ); - let compute_credentials = + let credentials = auth_quirks(ctx, &*api, user_info, client, allow_cleartext, config).await?; - - let mut num_retries = 0; - let mut node = - wake_compute(&mut num_retries, ctx, &api, &compute_credentials.info).await?; - - ctx.set_project(node.aux.clone()); - - match compute_credentials.keys { - #[cfg(any(test, feature = "testing"))] - ComputeCredentialKeys::Password(password) => node.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => node.config.auth_keys(auth_keys), - }; - - (node, BackendType::Console(api, compute_credentials.info)) + BackendType::Console(api, credentials) } // NOTE: this auth backend doesn't use client credentials. 
- Link(url) => { + Link(url, _) => { info!("performing link authentication"); - let node_info = link::authenticate(ctx, &url, client).await?; + let info = link::authenticate(ctx, &url, client).await?; - ( - CachedNodeInfo::new_uncached(node_info), - BackendType::Link(url), - ) + BackendType::Link(url, info) } }; @@ -358,7 +342,7 @@ impl<'a> BackendType<'a, ComputeUserInfoMaybeEndpoint> { } } -impl BackendType<'_, ComputeUserInfo> { +impl BackendType<'_, ComputeUserInfo, &()> { pub async fn get_role_secret( &self, ctx: &mut RequestMonitoring, @@ -366,7 +350,7 @@ impl BackendType<'_, ComputeUserInfo> { use BackendType::*; match self { Console(api, user_info) => api.get_role_secret(ctx, user_info).await, - Link(_) => Ok(Cached::new_uncached(None)), + Link(_, _) => Ok(Cached::new_uncached(None)), } } @@ -377,21 +361,51 @@ impl BackendType<'_, ComputeUserInfo> { use BackendType::*; match self { Console(api, user_info) => api.get_allowed_ips_and_secret(ctx, user_info).await, - Link(_) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), - } - } - - /// When applicable, wake the compute node, gaining its connection info in the process. - /// The link auth flow doesn't support this, so we return [`None`] in that case. - pub async fn wake_compute( - &self, - ctx: &mut RequestMonitoring, - ) -> Result, console::errors::WakeComputeError> { - use BackendType::*; - - match self { - Console(api, user_info) => api.wake_compute(ctx, user_info).map_ok(Some).await, - Link(_) => Ok(None), + Link(_, _) => Ok((Cached::new_uncached(Arc::new(vec![])), None)), + } + } +} + +#[async_trait::async_trait] +impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, NodeInfo> { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + + match self { + Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Link(_, info) => Ok(Cached::new_uncached(info.clone())), + } + } + + fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + match self { + BackendType::Console(_, creds) => Some(&creds.keys), + BackendType::Link(_, _) => None, + } + } +} + +#[async_trait::async_trait] +impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result { + use BackendType::*; + + match self { + Console(api, creds) => api.wake_compute(ctx, &creds.info).await, + Link(_, _) => unreachable!("link auth flow doesn't support waking the compute"), + } + } + + fn get_keys(&self) -> Option<&ComputeCredentialKeys> { + match self { + BackendType::Console(_, creds) => Some(&creds.keys), + BackendType::Link(_, _) => None, } } } diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index e855843bc3..d075331846 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -17,7 +17,7 @@ pub(super) async fn authenticate( client: &mut PqStream>, config: &'static AuthenticationConfig, secret: AuthSecret, -) -> auth::Result> { +) -> auth::Result { let flow = AuthFlow::new(client); let scram_keys = match secret { #[cfg(any(test, feature = "testing"))] diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 9f60b709d4..26cf7a01f2 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -20,7 +20,7 @@ pub async fn authenticate_cleartext( info: ComputeUserInfo, client: &mut stream::PqStream>, secret: AuthSecret, -) -> auth::Result> { +) -> auth::Result { warn!("cleartext auth flow override is 
enabled, proceeding"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -51,7 +51,7 @@ pub async fn password_hack_no_authentication( ctx: &mut RequestMonitoring, info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, -) -> auth::Result>> { +) -> auth::Result { warn!("project not specified, resorting to the password hack auth flow"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); @@ -73,6 +73,6 @@ pub async fn password_hack_no_authentication( options: info.options, endpoint: payload.endpoint, }, - keys: payload.password, + keys: ComputeCredentialKeys::Password(payload.password), }) } diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 8fbcb56758..00a229c135 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -383,7 +383,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } AuthBackend::Link => { let url = args.uri.parse()?; - auth::BackendType::Link(MaybeOwned::Owned(url)) + auth::BackendType::Link(MaybeOwned::Owned(url), ()) } }; let http_config = HttpConfig { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 83940d80ec..b61c1fb9ef 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,7 +1,7 @@ use crate::{ auth::parse_endpoint_param, cancellation::CancelClosure, - console::errors::WakeComputeError, + console::{errors::WakeComputeError, messages::MetricsAuxInfo}, context::RequestMonitoring, error::{ReportableError, UserFacingError}, metrics::NUM_DB_CONNECTIONS_GAUGE, @@ -93,7 +93,7 @@ impl ConnCfg { } /// Reuse password or auth keys from the other config. - pub fn reuse_password(&mut self, other: &Self) { + pub fn reuse_password(&mut self, other: Self) { if let Some(password) = other.get_password() { self.password(password); } @@ -253,6 +253,8 @@ pub struct PostgresConnection { pub params: std::collections::HashMap, /// Query cancellation token. pub cancel_closure: CancelClosure, + /// Labels for proxy's metrics. 
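+    /// (Carried over from the `NodeInfo` returned by the console, so metrics for the
+    ///  established connection can still be labelled.)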
+ pub aux: MetricsAuxInfo, _guage: IntCounterPairGuard, } @@ -263,6 +265,7 @@ impl ConnCfg { &self, ctx: &mut RequestMonitoring, allow_self_signed_compute: bool, + aux: MetricsAuxInfo, timeout: Duration, ) -> Result { let (socket_addr, stream, host) = self.connect_raw(timeout).await?; @@ -297,6 +300,7 @@ impl ConnCfg { stream, params, cancel_closure, + aux, _guage: NUM_DB_CONNECTIONS_GAUGE .with_label_values(&[ctx.protocol]) .guard(), diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 31c9228b35..5fcb537834 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -13,7 +13,7 @@ use x509_parser::oid_registry; pub struct ProxyConfig { pub tls_config: Option, - pub auth_backend: auth::BackendType<'static, ()>, + pub auth_backend: auth::BackendType<'static, (), ()>, pub metric_collection: Option, pub allow_self_signed_compute: bool, pub http_config: HttpConfig, diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index e5cad42753..640444d14e 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -4,7 +4,10 @@ pub mod neon; use super::messages::MetricsAuxInfo; use crate::{ - auth::{backend::ComputeUserInfo, IpPattern}, + auth::{ + backend::{ComputeCredentialKeys, ComputeUserInfo}, + IpPattern, + }, cache::{project_info::ProjectInfoCacheImpl, Cached, TimedLru}, compute, config::{CacheOptions, ProjectInfoCacheOptions}, @@ -261,6 +264,34 @@ pub struct NodeInfo { pub allow_self_signed_compute: bool, } +impl NodeInfo { + pub async fn connect( + &self, + ctx: &mut RequestMonitoring, + timeout: Duration, + ) -> Result { + self.config + .connect( + ctx, + self.allow_self_signed_compute, + self.aux.clone(), + timeout, + ) + .await + } + pub fn reuse_settings(&mut self, other: Self) { + self.allow_self_signed_compute = other.allow_self_signed_compute; + self.config.reuse_password(other.config); + } + + pub fn set_keys(&mut self, keys: &ComputeCredentialKeys) { + match keys { + ComputeCredentialKeys::Password(password) => self.config.password(password), + ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), + }; + } +} + pub type NodeInfoCache = TimedLru; pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; diff --git a/proxy/src/console/provider/mock.rs b/proxy/src/console/provider/mock.rs index 79a04f255d..0579ef6fc4 100644 --- a/proxy/src/console/provider/mock.rs +++ b/proxy/src/console/provider/mock.rs @@ -176,9 +176,7 @@ impl super::Api for Api { _ctx: &mut RequestMonitoring, _user_info: &ComputeUserInfo, ) -> Result { - self.do_wake_compute() - .map_ok(CachedNodeInfo::new_uncached) - .await + self.do_wake_compute().map_ok(Cached::new_uncached).await } } diff --git a/proxy/src/error.rs b/proxy/src/error.rs index eafe92bf48..69fe1ebc12 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -29,7 +29,7 @@ pub trait UserFacingError: ReportableError { } } -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// Wrong password, unknown endpoint, protocol violation, etc... 
User, @@ -90,3 +90,13 @@ impl ReportableError for tokio::time::error::Elapsed { ErrorKind::RateLimit } } + +impl ReportableError for tokio_postgres::error::Error { + fn get_error_kind(&self) -> ErrorKind { + if self.as_db_error().is_some() { + ErrorKind::Postgres + } else { + ErrorKind::Compute + } + } +} diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 77aadb6f28..5f65de4c98 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -163,14 +163,14 @@ pub enum ClientMode { /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { - fn allow_cleartext(&self) -> bool { + pub fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, ClientMode::Websockets { .. } => true, } } - fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { + pub fn allow_self_signed_compute(&self, config: &ProxyConfig) -> bool { match self { ClientMode::Tcp => config.allow_self_signed_compute, ClientMode::Websockets { .. } => false, @@ -287,7 +287,7 @@ pub async fn handle_client( } let user = user_info.get_user().to_owned(); - let (mut node_info, user_info) = match user_info + let user_info = match user_info .authenticate( ctx, &mut stream, @@ -306,14 +306,11 @@ pub async fn handle_client( } }; - node_info.allow_self_signed_compute = mode.allow_self_signed_compute(config); - - let aux = node_info.aux.clone(); let mut node = connect_to_compute( ctx, &TcpMechanism { params: ¶ms }, - node_info, &user_info, + mode.allow_self_signed_compute(config), ) .or_else(|e| stream.throw_error(e)) .await?; @@ -330,8 +327,8 @@ pub async fn handle_client( Ok(Some(ProxyPassthrough { client: stream, + aux: node.aux.clone(), compute: node, - aux, req: _request_gauge, conn: _client_gauge, })) diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index b9346aa743..6e57caf998 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,8 +1,9 @@ use crate::{ - auth, + auth::backend::ComputeCredentialKeys, compute::{self, PostgresConnection}, - console::{self, errors::WakeComputeError}, + console::{self, errors::WakeComputeError, CachedNodeInfo, NodeInfo}, context::RequestMonitoring, + error::ReportableError, metrics::NUM_CONNECTION_FAILURES, proxy::{ retry::{retry_after, ShouldRetry}, @@ -20,7 +21,7 @@ const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); /// (e.g. the compute node's address might've changed at the wrong time). /// Invalidate the cache entry (if any) to prevent subsequent errors. 
#[tracing::instrument(name = "invalidate_cache", skip_all)] -pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg { +pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> NodeInfo { let is_cached = node_info.cached(); if is_cached { warn!("invalidating stalled compute node info cache entry"); @@ -31,13 +32,13 @@ pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg }; NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc(); - node_info.invalidate().config + node_info.invalidate() } #[async_trait] pub trait ConnectMechanism { type Connection; - type ConnectError; + type ConnectError: ReportableError; type Error: From; async fn connect_once( &self, @@ -49,6 +50,16 @@ pub trait ConnectMechanism { fn update_connect_config(&self, conf: &mut compute::ConnCfg); } +#[async_trait] +pub trait ComputeConnectBackend { + async fn wake_compute( + &self, + ctx: &mut RequestMonitoring, + ) -> Result; + + fn get_keys(&self) -> Option<&ComputeCredentialKeys>; +} + pub struct TcpMechanism<'a> { /// KV-dictionary with PostgreSQL connection params. pub params: &'a StartupMessageParams, @@ -67,11 +78,7 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &console::CachedNodeInfo, timeout: time::Duration, ) -> Result { - let allow_self_signed_compute = node_info.allow_self_signed_compute; - node_info - .config - .connect(ctx, allow_self_signed_compute, timeout) - .await + node_info.connect(ctx, timeout).await } fn update_connect_config(&self, config: &mut compute::ConnCfg) { @@ -82,16 +89,23 @@ impl ConnectMechanism for TcpMechanism<'_> { /// Try to connect to the compute node, retrying if necessary. /// This function might update `node_info`, so we take it by `&mut`. #[tracing::instrument(skip_all)] -pub async fn connect_to_compute( +pub async fn connect_to_compute( ctx: &mut RequestMonitoring, mechanism: &M, - mut node_info: console::CachedNodeInfo, - user_info: &auth::BackendType<'_, auth::backend::ComputeUserInfo>, + user_info: &B, + allow_self_signed_compute: bool, ) -> Result where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { + let mut num_retries = 0; + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + if let Some(keys) = user_info.get_keys() { + node_info.set_keys(keys); + } + node_info.allow_self_signed_compute = allow_self_signed_compute; + // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); // try once @@ -108,28 +122,31 @@ where error!(error = ?err, "could not connect to compute node"); - let mut num_retries = 1; - - match user_info { - auth::BackendType::Console(api, info) => { + let node_info = + if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() { + // If the error is Postgres, that means that we managed to connect to the compute node, but there was an error. + // Do not need to retrieve a new node_info, just return the old one. 
+ if !err.should_retry(num_retries) { + return Err(err.into()); + } + node_info + } else { // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node info!("compute node's state has likely changed; requesting a wake-up"); - ctx.latency_timer.cache_miss(); - let config = invalidate_cache(node_info); - node_info = wake_compute(&mut num_retries, ctx, api, info).await?; + let old_node_info = invalidate_cache(node_info); + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + node_info.reuse_settings(old_node_info); - node_info.config.reuse_password(&config); mechanism.update_connect_config(&mut node_info.config); - } - // nothing to do? - auth::BackendType::Link(_) => {} - }; + node_info + }; // now that we have a new node, try connect to it repeatedly. // this can error for a few reasons, for instance: // * DNS connection settings haven't quite propagated yet info!("wake_compute success. attempting to connect"); + num_retries = 1; loop { match mechanism .connect_once(ctx, &node_info, CONNECT_TIMEOUT) diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 5bb43c0375..efbd661bbf 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -2,13 +2,19 @@ mod mitm; +use std::time::Duration; + use super::connect_compute::ConnectMechanism; use super::retry::ShouldRetry; use super::*; -use crate::auth::backend::{ComputeUserInfo, MaybeOwned, TestBackend}; +use crate::auth::backend::{ + ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, +}; use crate::config::CertResolver; +use crate::console::caches::NodeInfoCache; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ErrorKind; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::{auth, http, sasl, scram}; use async_trait::async_trait; @@ -369,12 +375,15 @@ enum ConnectAction { Connect, Retry, Fail, + RetryPg, + FailPg, } #[derive(Clone)] struct TestConnectMechanism { counter: Arc>, sequence: Vec, + cache: &'static NodeInfoCache, } impl TestConnectMechanism { @@ -393,6 +402,12 @@ impl TestConnectMechanism { Self { counter: Arc::new(std::sync::Mutex::new(0)), sequence, + cache: Box::leak(Box::new(NodeInfoCache::new( + "test", + 1, + Duration::from_secs(100), + false, + ))), } } } @@ -403,6 +418,13 @@ struct TestConnection; #[derive(Debug)] struct TestConnectError { retryable: bool, + kind: crate::error::ErrorKind, +} + +impl ReportableError for TestConnectError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + self.kind + } } impl std::fmt::Display for TestConnectError { @@ -436,8 +458,22 @@ impl ConnectMechanism for TestConnectMechanism { *counter += 1; match action { ConnectAction::Connect => Ok(TestConnection), - ConnectAction::Retry => Err(TestConnectError { retryable: true }), - ConnectAction::Fail => Err(TestConnectError { retryable: false }), + ConnectAction::Retry => Err(TestConnectError { + retryable: true, + kind: ErrorKind::Compute, + }), + ConnectAction::Fail => Err(TestConnectError { + retryable: false, + kind: ErrorKind::Compute, + }), + ConnectAction::FailPg => Err(TestConnectError { + retryable: false, + kind: ErrorKind::Postgres, + }), + ConnectAction::RetryPg => Err(TestConnectError { + retryable: true, + kind: ErrorKind::Postgres, + }), x => panic!("expecting action {:?}, connect is called instead", x), } } @@ -451,7 +487,7 @@ impl TestBackend for TestConnectMechanism { let 
action = self.sequence[*counter]; *counter += 1; match action { - ConnectAction::Wake => Ok(helper_create_cached_node_info()), + ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { let err = console::errors::ApiError::Console { status: http::StatusCode::FORBIDDEN, @@ -483,37 +519,41 @@ impl TestBackend for TestConnectMechanism { } } -fn helper_create_cached_node_info() -> CachedNodeInfo { +fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), aux: Default::default(), allow_self_signed_compute: false, }; - CachedNodeInfo::new_uncached(node) + let (_, node) = cache.insert("key".into(), node); + node } fn helper_create_connect_info( mechanism: &TestConnectMechanism, -) -> (CachedNodeInfo, auth::BackendType<'static, ComputeUserInfo>) { - let cache = helper_create_cached_node_info(); +) -> auth::BackendType<'static, ComputeCredentials, &()> { let user_info = auth::BackendType::Console( MaybeOwned::Owned(ConsoleBackend::Test(Box::new(mechanism.clone()))), - ComputeUserInfo { - endpoint: "endpoint".into(), - user: "user".into(), - options: NeonOptions::parse_options_raw(""), + ComputeCredentials { + info: ComputeUserInfo { + endpoint: "endpoint".into(), + user: "user".into(), + options: NeonOptions::parse_options_raw(""), + }, + keys: ComputeCredentialKeys::Password("password".into()), }, ); - (cache, user_info) + user_info } #[tokio::test] async fn connect_to_compute_success() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -521,24 +561,52 @@ async fn connect_to_compute_success() { #[tokio::test] async fn connect_to_compute_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); } +#[tokio::test] +async fn connect_to_compute_retry_pg() { + let _ = env_logger::try_init(); + use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) + .await + .unwrap(); + mechanism.verify(); +} + +#[tokio::test] +async fn connect_to_compute_fail_pg() { + let _ = env_logger::try_init(); + use ConnectAction::*; + let mut ctx = RequestMonitoring::test(); + let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) + .await + .unwrap_err(); + mechanism.verify(); +} + /// Test that we don't retry if the error is not 
retryable. #[tokio::test] async fn connect_to_compute_non_retry_1() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Retry, Wake, Fail]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); @@ -547,11 +615,12 @@ async fn connect_to_compute_non_retry_1() { /// Even for non-retryable errors, we should retry at least once. #[tokio::test] async fn connect_to_compute_non_retry_2() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![Wake, Fail, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -560,15 +629,16 @@ async fn connect_to_compute_non_retry_2() { /// Retry for at most `NUM_RETRIES_CONNECT` times. #[tokio::test] async fn connect_to_compute_non_retry_3() { + let _ = env_logger::try_init(); assert_eq!(NUM_RETRIES_CONNECT, 16); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); let mechanism = TestConnectMechanism::new(vec![ - Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, - Retry, Retry, Retry, Retry, /* the 17th time */ Retry, + Wake, Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, + Retry, Retry, Retry, Retry, Retry, /* the 17th time */ Retry, ]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); @@ -577,11 +647,12 @@ async fn connect_to_compute_non_retry_3() { /// Should retry wake compute. #[tokio::test] async fn wake_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![WakeRetry, Wake, Connect]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap(); mechanism.verify(); @@ -590,11 +661,12 @@ async fn wake_retry() { /// Wake failed with a non-retryable error. 
#[tokio::test] async fn wake_non_retry() { + let _ = env_logger::try_init(); use ConnectAction::*; let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]); - let (cache, user_info) = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, cache, &user_info) + let mechanism = TestConnectMechanism::new(vec![WakeRetry, WakeFail]); + let user_info = helper_create_connect_info(&mechanism); + connect_to_compute(&mut ctx, &mechanism, &user_info, false) .await .unwrap_err(); mechanism.verify(); diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 925727bdab..2c593451b4 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,9 +1,4 @@ -use crate::auth::backend::ComputeUserInfo; -use crate::console::{ - errors::WakeComputeError, - provider::{CachedNodeInfo, ConsoleBackend}, - Api, -}; +use crate::console::{errors::WakeComputeError, provider::CachedNodeInfo}; use crate::context::RequestMonitoring; use crate::metrics::{bool_to_str, NUM_WAKEUP_FAILURES}; use crate::proxy::retry::retry_after; @@ -11,17 +6,16 @@ use hyper::StatusCode; use std::ops::ControlFlow; use tracing::{error, warn}; +use super::connect_compute::ComputeConnectBackend; use super::retry::ShouldRetry; -/// wake a compute (or retrieve an existing compute session from cache) -pub async fn wake_compute( +pub async fn wake_compute( num_retries: &mut u32, ctx: &mut RequestMonitoring, - api: &ConsoleBackend, - info: &ComputeUserInfo, + api: &B, ) -> Result { loop { - let wake_res = api.wake_compute(ctx, info).await; + let wake_res = api.wake_compute(ctx).await; match handle_try_wake(wake_res, *num_retries) { Err(e) => { error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 156002006d..6f93f86d5f 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -4,7 +4,7 @@ use async_trait::async_trait; use tracing::{field::display, info}; use crate::{ - auth::{backend::ComputeCredentialKeys, check_peer_addr_is_in_list, AuthError}, + auth::{backend::ComputeCredentials, check_peer_addr_is_in_list, AuthError}, compute, config::ProxyConfig, console::{ @@ -27,7 +27,7 @@ impl PoolingBackend { &self, ctx: &mut RequestMonitoring, conn_info: &ConnInfo, - ) -> Result { + ) -> Result { let user_info = conn_info.user_info.clone(); let backend = self.config.auth_backend.as_ref().map(|_| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; @@ -49,13 +49,17 @@ impl PoolingBackend { }; let auth_outcome = crate::auth::validate_password_and_exchange(&conn_info.password, secret)?; - match auth_outcome { + let res = match auth_outcome { crate::sasl::Outcome::Success(key) => Ok(key), crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); Err(AuthError::auth_failed(&*conn_info.user_info.user)) } - } + }; + res.map(|key| ComputeCredentials { + info: user_info, + keys: key, + }) } // Wake up the destination if needed. 
Code here is a bit involved because @@ -66,7 +70,7 @@ impl PoolingBackend { &self, ctx: &mut RequestMonitoring, conn_info: ConnInfo, - keys: ComputeCredentialKeys, + keys: ComputeCredentials, force_new: bool, ) -> Result, HttpConnError> { let maybe_client = if !force_new { @@ -82,26 +86,8 @@ impl PoolingBackend { } let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); - info!("pool: opening a new connection '{conn_info}'"); - let backend = self - .config - .auth_backend - .as_ref() - .map(|_| conn_info.user_info.clone()); - - let mut node_info = backend - .wake_compute(ctx) - .await? - .ok_or(HttpConnError::NoComputeInfo)?; - - match keys { - #[cfg(any(test, feature = "testing"))] - ComputeCredentialKeys::Password(password) => node_info.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => node_info.config.auth_keys(auth_keys), - }; - - ctx.set_project(node_info.aux.clone()); - + info!(%conn_id, "pool: opening a new connection '{conn_info}'"); + let backend = self.config.auth_backend.as_ref().map(|_| keys); crate::proxy::connect_compute::connect_to_compute( ctx, &TokioMechanism { @@ -109,8 +95,8 @@ impl PoolingBackend { conn_info, pool: self.pool.clone(), }, - node_info, &backend, + false, // do not allow self signed compute for http flow ) .await } @@ -129,8 +115,6 @@ pub enum HttpConnError { AuthError(#[from] AuthError), #[error("wake_compute returned error")] WakeCompute(#[from] WakeComputeError), - #[error("wake_compute returned nothing")] - NoComputeInfo, } struct TokioMechanism { From c49c9707ced792aaa53d4a27bfe2719322704db4 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Tue, 13 Feb 2024 17:58:58 +0100 Subject: [PATCH 119/209] Proxy: send cancel notifications to all instances (#6719) ## Problem If a cancel request ends up on the wrong proxy instance, it doesn't take effect. ## Summary of changes Send redis notifications to all proxy pods about the cancel request.
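For illustration only (not part of the patch itself): the cancellation notification is published on the proxy-to-proxy Redis channel as an adjacently tagged JSON message. Below is a minimal sketch of the payload shape, assuming made-up values for the region, cancel key, and session id; it only requires the `serde_json` crate.

```rust
// Sketch of the wire format, derived from the `Notification` / `CancelSession`
// types in this patch (serde: tag = "topic", content = "data").
// All concrete values here are invented examples.
fn main() {
    let payload = serde_json::json!({
        "topic": "/cancel_session",
        "data": {
            "region_id": "example-region",                       // assumed region name
            "cancel_key_data": { "backend_pid": 42, "cancel_key": 41 },
            "session_id": "00000000-0000-0000-0000-000000000001" // per-request UUID
        }
    });
    println!("{payload}");
}
```

A proxy instance that receives this message and owns the cancellation key cancels the session locally without re-publishing; messages carrying a different `region_id` are ignored.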
Related issue: https://github.com/neondatabase/neon/issues/5839, https://github.com/neondatabase/cloud/issues/10262 --- Cargo.lock | 7 +- Cargo.toml | 2 +- libs/pq_proto/Cargo.toml | 1 + libs/pq_proto/src/lib.rs | 3 +- proxy/src/bin/proxy.rs | 32 ++++- proxy/src/cancellation.rs | 109 ++++++++++++++--- proxy/src/config.rs | 1 + proxy/src/metrics.rs | 9 ++ proxy/src/proxy.rs | 16 +-- proxy/src/rate_limiter.rs | 2 +- proxy/src/rate_limiter/limiter.rs | 38 ++++++ proxy/src/redis.rs | 1 + proxy/src/redis/notifications.rs | 197 +++++++++++++++++++++++------- proxy/src/redis/publisher.rs | 80 ++++++++++++ proxy/src/serverless.rs | 13 +- proxy/src/serverless/websocket.rs | 6 +- workspace_hack/Cargo.toml | 4 +- 17 files changed, 432 insertions(+), 89 deletions(-) create mode 100644 proxy/src/redis/publisher.rs diff --git a/Cargo.lock b/Cargo.lock index 83afdaf66f..f5f32e8491 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2247,11 +2247,11 @@ dependencies = [ [[package]] name = "hashlink" -version = "0.8.2" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0761a1b9491c4f2e3d66aa0f62d0fba0af9a0e2852e4d48ea506632a4b56e6aa" +checksum = "e8094feaf31ff591f651a2664fb9cfd92bba7a60ce3197265e9482ebe753c8f7" dependencies = [ - "hashbrown 0.13.2", + "hashbrown 0.14.0", ] [[package]] @@ -3936,6 +3936,7 @@ dependencies = [ "pin-project-lite", "postgres-protocol", "rand 0.8.5", + "serde", "thiserror", "tokio", "tracing", diff --git a/Cargo.toml b/Cargo.toml index ebc3dfa7b1..74dbaf1853 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,7 +80,7 @@ futures-core = "0.3" futures-util = "0.3" git-version = "0.3" hashbrown = "0.13" -hashlink = "0.8.1" +hashlink = "0.8.4" hdrhistogram = "7.5.2" hex = "0.4" hex-literal = "0.4" diff --git a/libs/pq_proto/Cargo.toml b/libs/pq_proto/Cargo.toml index b286eb0358..6eeb3bafef 100644 --- a/libs/pq_proto/Cargo.toml +++ b/libs/pq_proto/Cargo.toml @@ -13,5 +13,6 @@ rand.workspace = true tokio.workspace = true tracing.workspace = true thiserror.workspace = true +serde.workspace = true workspace_hack.workspace = true diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index c52a21bcd3..522b65f5d1 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -7,6 +7,7 @@ pub mod framed; use byteorder::{BigEndian, ReadBytesExt}; use bytes::{Buf, BufMut, Bytes, BytesMut}; +use serde::{Deserialize, Serialize}; use std::{borrow::Cow, collections::HashMap, fmt, io, str}; // re-export for use in utils pageserver_feedback.rs @@ -123,7 +124,7 @@ impl StartupMessageParams { } } -#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy)] +#[derive(Debug, Hash, PartialEq, Eq, Clone, Copy, Serialize, Deserialize)] pub struct CancelKeyData { pub backend_pid: i32, pub cancel_key: i32, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 00a229c135..b3d4fc0411 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,6 +1,8 @@ use futures::future::Either; use proxy::auth; use proxy::auth::backend::MaybeOwned; +use proxy::cancellation::CancelMap; +use proxy::cancellation::CancellationHandler; use proxy::config::AuthenticationConfig; use proxy::config::CacheOptions; use proxy::config::HttpConfig; @@ -12,6 +14,7 @@ use proxy::rate_limiter::EndpointRateLimiter; use proxy::rate_limiter::RateBucketInfo; use proxy::rate_limiter::RateLimiterConfig; use proxy::redis::notifications; +use proxy::redis::publisher::RedisPublisherClient; use proxy::serverless::GlobalConnPoolOptions; use proxy::usage_metrics; @@ -22,6 +25,7 @@ use 
std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; use tokio::net::TcpListener; +use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::info; @@ -129,6 +133,9 @@ struct ProxyCliArgs { /// Can be given multiple times for different bucket sizes. #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] endpoint_rps_limit: Vec, + /// Redis rate limiter max number of requests per second. + #[clap(long, default_values_t = RateBucketInfo::DEFAULT_SET)] + redis_rps_limit: Vec, /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`. #[clap(long, default_value_t = 100)] initial_limit: usize, @@ -225,6 +232,19 @@ async fn main() -> anyhow::Result<()> { let cancellation_token = CancellationToken::new(); let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit)); + let cancel_map = CancelMap::default(); + let redis_publisher = match &args.redis_notifications { + Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( + url, + args.region.clone(), + &config.redis_rps_limit, + )?))), + None => None, + }; + let cancellation_handler = Arc::new(CancellationHandler::new( + cancel_map.clone(), + redis_publisher, + )); // client facing tasks. these will exit on error or on cancellation // cancellation returns Ok(()) @@ -234,6 +254,7 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), endpoint_rate_limiter.clone(), + cancellation_handler.clone(), )); // TODO: rename the argument to something like serverless. @@ -248,6 +269,7 @@ async fn main() -> anyhow::Result<()> { serverless_listener, cancellation_token.clone(), endpoint_rate_limiter.clone(), + cancellation_handler.clone(), )); } @@ -271,7 +293,12 @@ async fn main() -> anyhow::Result<()> { let cache = api.caches.project_info.clone(); if let Some(url) = args.redis_notifications { info!("Starting redis notifications listener ({url})"); - maintenance_tasks.spawn(notifications::task_main(url.to_owned(), cache.clone())); + maintenance_tasks.spawn(notifications::task_main( + url.to_owned(), + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); } maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); } @@ -403,6 +430,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { let mut endpoint_rps_limit = args.endpoint_rps_limit.clone(); RateBucketInfo::validate(&mut endpoint_rps_limit)?; + let mut redis_rps_limit = args.redis_rps_limit.clone(); + RateBucketInfo::validate(&mut redis_rps_limit)?; let config = Box::leak(Box::new(ProxyConfig { tls_config, @@ -414,6 +443,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { require_client_ip: args.require_client_ip, disable_ip_check_for_http: args.disable_ip_check_for_http, endpoint_rps_limit, + redis_rps_limit, handshake_timeout: args.handshake_timeout, // TODO: add this argument region: args.region.clone(), diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index fe614628d8..93a77bc4ae 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,16 +1,28 @@ +use async_trait::async_trait; use dashmap::DashMap; use pq_proto::CancelKeyData; use std::{net::SocketAddr, sync::Arc}; use thiserror::Error; use tokio::net::TcpStream; +use tokio::sync::Mutex; use tokio_postgres::{CancelToken, NoTls}; use tracing::info; +use uuid::Uuid; -use crate::error::ReportableError; +use crate::{ + error::ReportableError, 
metrics::NUM_CANCELLATION_REQUESTS, + redis::publisher::RedisPublisherClient, +}; + +pub type CancelMap = Arc>>; /// Enables serving `CancelRequest`s. -#[derive(Default)] -pub struct CancelMap(DashMap>); +/// +/// If there is a `RedisPublisherClient` available, it will be used to publish the cancellation key to other proxy instances. +pub struct CancellationHandler { + map: CancelMap, + redis_client: Option>>, +} #[derive(Debug, Error)] pub enum CancelError { @@ -32,15 +44,43 @@ impl ReportableError for CancelError { } } -impl CancelMap { +impl CancellationHandler { + pub fn new(map: CancelMap, redis_client: Option>>) -> Self { + Self { map, redis_client } + } /// Cancel a running query for the corresponding connection. - pub async fn cancel_session(&self, key: CancelKeyData) -> Result<(), CancelError> { + pub async fn cancel_session( + &self, + key: CancelKeyData, + session_id: Uuid, + ) -> Result<(), CancelError> { + let from = "from_client"; // NB: we should immediately release the lock after cloning the token. - let Some(cancel_closure) = self.0.get(&key).and_then(|x| x.clone()) else { + let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else { tracing::warn!("query cancellation key not found: {key}"); + if let Some(redis_client) = &self.redis_client { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "not_found"]) + .inc(); + info!("publishing cancellation key to Redis"); + match redis_client.lock().await.try_publish(key, session_id).await { + Ok(()) => { + info!("cancellation key successfully published to Redis"); + } + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + return Err(CancelError::IO(std::io::Error::new( + std::io::ErrorKind::Other, + e.to_string(), + ))); + } + } + } return Ok(()); }; - + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "found"]) + .inc(); info!("cancelling query per user's request using key {key}"); cancel_closure.try_cancel_query().await } @@ -57,7 +97,7 @@ impl CancelMap { // Random key collisions are unlikely to happen here, but they're still possible, // which is why we have to take care not to rewrite an existing key.
- match self.0.entry(key) { + match self.map.entry(key) { dashmap::mapref::entry::Entry::Occupied(_) => continue, dashmap::mapref::entry::Entry::Vacant(e) => { e.insert(None); @@ -69,18 +109,46 @@ impl CancelMap { info!("registered new query cancellation key {key}"); Session { key, - cancel_map: self, + cancellation_handler: self, } } #[cfg(test)] fn contains(&self, session: &Session) -> bool { - self.0.contains_key(&session.key) + self.map.contains_key(&session.key) } #[cfg(test)] fn is_empty(&self) -> bool { - self.0.is_empty() + self.map.is_empty() + } +} + +#[async_trait] +pub trait NotificationsCancellationHandler { + async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError>; +} + +#[async_trait] +impl NotificationsCancellationHandler for CancellationHandler { + async fn cancel_session_no_publish(&self, key: CancelKeyData) -> Result<(), CancelError> { + let from = "from_redis"; + let cancel_closure = self.map.get(&key).and_then(|x| x.clone()); + match cancel_closure { + Some(cancel_closure) => { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "found"]) + .inc(); + cancel_closure.try_cancel_query().await + } + None => { + NUM_CANCELLATION_REQUESTS + .with_label_values(&[from, "not_found"]) + .inc(); + tracing::warn!("query cancellation key not found: {key}"); + Ok(()) + } + } } } @@ -115,7 +183,7 @@ pub struct Session { /// The user-facing key identifying this session. key: CancelKeyData, /// The [`CancelMap`] this session belongs to. - cancel_map: Arc, + cancellation_handler: Arc, } impl Session { @@ -123,7 +191,9 @@ impl Session { /// This enables query cancellation in `crate::proxy::prepare_client_connection`. pub fn enable_query_cancellation(&self, cancel_closure: CancelClosure) -> CancelKeyData { info!("enabling query cancellation for this session"); - self.cancel_map.0.insert(self.key, Some(cancel_closure)); + self.cancellation_handler + .map + .insert(self.key, Some(cancel_closure)); self.key } @@ -131,7 +201,7 @@ impl Session { impl Drop for Session { fn drop(&mut self) { - self.cancel_map.0.remove(&self.key); + self.cancellation_handler.map.remove(&self.key); info!("dropped query cancellation key {}", &self.key); } } @@ -142,13 +212,16 @@ mod tests { #[tokio::test] async fn check_session_drop() -> anyhow::Result<()> { - let cancel_map: Arc = Default::default(); + let cancellation_handler = Arc::new(CancellationHandler { + map: CancelMap::default(), + redis_client: None, + }); - let session = cancel_map.clone().get_session(); - assert!(cancel_map.contains(&session)); + let session = cancellation_handler.clone().get_session(); + assert!(cancellation_handler.contains(&session)); drop(session); // Check that the session has been dropped. 
- assert!(cancel_map.is_empty()); + assert!(cancellation_handler.is_empty()); Ok(()) } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5fcb537834..9f276c3c24 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -21,6 +21,7 @@ pub struct ProxyConfig { pub require_client_ip: bool, pub disable_ip_check_for_http: bool, pub endpoint_rps_limit: Vec, + pub redis_rps_limit: Vec, pub region: String, pub handshake_timeout: Duration, } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index f7f162a075..66031f5eb2 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -152,6 +152,15 @@ pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy = Lazy::new(|| { .unwrap() }); +pub static NUM_CANCELLATION_REQUESTS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "proxy_cancellation_requests_total", + "Number of cancellation requests (per found/not_found).", + &["source", "kind"], + ) + .unwrap() +}); + #[derive(Clone)] pub struct LatencyTimer { // time since the stopwatch was started diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5f65de4c98..ce77098a5f 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -10,7 +10,7 @@ pub mod wake_compute; use crate::{ auth, - cancellation::{self, CancelMap}, + cancellation::{self, CancellationHandler}, compute, config::{ProxyConfig, TlsConfig}, context::RequestMonitoring, @@ -62,6 +62,7 @@ pub async fn task_main( listener: tokio::net::TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("proxy has shut down"); @@ -72,7 +73,6 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); - let cancel_map = Arc::new(CancelMap::default()); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -80,7 +80,7 @@ pub async fn task_main( let (socket, peer_addr) = accept_result?; let session_id = uuid::Uuid::new_v4(); - let cancel_map = Arc::clone(&cancel_map); + let cancellation_handler = Arc::clone(&cancellation_handler); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); let session_span = info_span!( @@ -113,7 +113,7 @@ pub async fn task_main( let res = handle_client( config, &mut ctx, - cancel_map, + cancellation_handler, socket, ClientMode::Tcp, endpoint_rate_limiter, @@ -227,7 +227,7 @@ impl ReportableError for ClientRequestError { pub async fn handle_client( config: &'static ProxyConfig, ctx: &mut RequestMonitoring, - cancel_map: Arc, + cancellation_handler: Arc, stream: S, mode: ClientMode, endpoint_rate_limiter: Arc, @@ -253,8 +253,8 @@ pub async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancel_map - .cancel_session(cancel_key_data) + return Ok(cancellation_handler + .cancel_session(cancel_key_data, ctx.session_id) .await .map(|()| None)?) 
} @@ -315,7 +315,7 @@ pub async fn handle_client( .or_else(|e| stream.throw_error(e)) .await?; - let session = cancel_map.get_session(); + let session = cancellation_handler.get_session(); prepare_client_connection(&node, &session, &mut stream).await?; // Before proxy passing, forward to compute whatever data is left in the diff --git a/proxy/src/rate_limiter.rs b/proxy/src/rate_limiter.rs index b26386d159..f0da4ead23 100644 --- a/proxy/src/rate_limiter.rs +++ b/proxy/src/rate_limiter.rs @@ -4,4 +4,4 @@ mod limiter; pub use aimd::Aimd; pub use limit_algorithm::{AimdConfig, Fixed, RateLimitAlgorithm, RateLimiterConfig}; pub use limiter::Limiter; -pub use limiter::{EndpointRateLimiter, RateBucketInfo}; +pub use limiter::{EndpointRateLimiter, RateBucketInfo, RedisRateLimiter}; diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index cbae72711c..3181060e2f 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -22,6 +22,44 @@ use super::{ RateLimiterConfig, }; +pub struct RedisRateLimiter { + data: Vec, + info: &'static [RateBucketInfo], +} + +impl RedisRateLimiter { + pub fn new(info: &'static [RateBucketInfo]) -> Self { + Self { + data: vec![ + RateBucket { + start: Instant::now(), + count: 0, + }; + info.len() + ], + info, + } + } + + /// Check that number of connections is below `max_rps` rps. + pub fn check(&mut self) -> bool { + let now = Instant::now(); + + let should_allow_request = self + .data + .iter_mut() + .zip(self.info) + .all(|(bucket, info)| bucket.should_allow_request(info, now)); + + if should_allow_request { + // only increment the bucket counts if the request will actually be accepted + self.data.iter_mut().for_each(RateBucket::inc); + } + + should_allow_request + } +} + // Simple per-endpoint rate limiter. // // Check that number of connections to the endpoint is below `max_rps` rps. 
diff --git a/proxy/src/redis.rs b/proxy/src/redis.rs index c2a91bed97..35d6db074e 100644 --- a/proxy/src/redis.rs +++ b/proxy/src/redis.rs @@ -1 +1,2 @@ pub mod notifications; +pub mod publisher; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 158884aa17..b8297a206c 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -1,38 +1,44 @@ use std::{convert::Infallible, sync::Arc}; use futures::StreamExt; +use pq_proto::CancelKeyData; use redis::aio::PubSub; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; +use uuid::Uuid; use crate::{ cache::project_info::ProjectInfoCache, + cancellation::{CancelMap, CancellationHandler, NotificationsCancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, }; -const CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; +const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; +pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; const RECONNECT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); const INVALIDATION_LAG: std::time::Duration = std::time::Duration::from_secs(20); -struct ConsoleRedisClient { +struct RedisConsumerClient { client: redis::Client, } -impl ConsoleRedisClient { +impl RedisConsumerClient { pub fn new(url: &str) -> anyhow::Result { let client = redis::Client::open(url)?; Ok(Self { client }) } async fn try_connect(&self) -> anyhow::Result { let mut conn = self.client.get_async_connection().await?.into_pubsub(); - tracing::info!("subscribing to a channel `{CHANNEL_NAME}`"); - conn.subscribe(CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{CPLANE_CHANNEL_NAME}`"); + conn.subscribe(CPLANE_CHANNEL_NAME).await?; + tracing::info!("subscribing to a channel `{PROXY_CHANNEL_NAME}`"); + conn.subscribe(PROXY_CHANNEL_NAME).await?; Ok(conn) } } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] #[serde(tag = "topic", content = "data")] -enum Notification { +pub(crate) enum Notification { #[serde( rename = "/allowed_ips_updated", deserialize_with = "deserialize_json_string" @@ -45,16 +51,25 @@ enum Notification { deserialize_with = "deserialize_json_string" )] PasswordUpdate { password_update: PasswordUpdate }, + #[serde(rename = "/cancel_session")] + Cancel(CancelSession), } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -struct AllowedIpsUpdate { +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct AllowedIpsUpdate { project_id: ProjectIdInt, } -#[derive(Clone, Debug, Deserialize, Eq, PartialEq)] -struct PasswordUpdate { +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct PasswordUpdate { project_id: ProjectIdInt, role_name: RoleNameInt, } +#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)] +pub(crate) struct CancelSession { + pub region_id: Option, + pub cancel_key_data: CancelKeyData, + pub session_id: Uuid, +} + fn deserialize_json_string<'de, D, T>(deserializer: D) -> Result where T: for<'de2> serde::Deserialize<'de2>, @@ -64,6 +79,88 @@ where serde_json::from_str(&s).map_err(::custom) } +struct MessageHandler< + C: ProjectInfoCache + Send + Sync + 'static, + H: NotificationsCancellationHandler + Send + Sync + 'static, +> { + cache: Arc, + cancellation_handler: Arc, + region_id: String, +} + +impl< + C: ProjectInfoCache + Send + Sync + 'static, + H: NotificationsCancellationHandler + Send + Sync + 'static, + > MessageHandler +{ + pub fn new(cache: Arc, 
cancellation_handler: Arc, region_id: String) -> Self { + Self { + cache, + cancellation_handler, + region_id, + } + } + pub fn disable_ttl(&self) { + self.cache.disable_ttl(); + } + pub fn enable_ttl(&self) { + self.cache.enable_ttl(); + } + #[tracing::instrument(skip(self, msg), fields(session_id = tracing::field::Empty))] + async fn handle_message(&self, msg: redis::Msg) -> anyhow::Result<()> { + use Notification::*; + let payload: String = msg.get_payload()?; + tracing::debug!(?payload, "received a message payload"); + + let msg: Notification = match serde_json::from_str(&payload) { + Ok(msg) => msg, + Err(e) => { + tracing::error!("broken message: {e}"); + return Ok(()); + } + }; + tracing::debug!(?msg, "received a message"); + match msg { + Cancel(cancel_session) => { + tracing::Span::current().record( + "session_id", + &tracing::field::display(cancel_session.session_id), + ); + if let Some(cancel_region) = cancel_session.region_id { + // If the message is not for this region, ignore it. + if cancel_region != self.region_id { + return Ok(()); + } + } + // This instance of cancellation_handler doesn't have a RedisPublisherClient so it can't publish the message. + match self + .cancellation_handler + .cancel_session_no_publish(cancel_session.cancel_key_data) + .await + { + Ok(()) => {} + Err(e) => { + tracing::error!("failed to cancel session: {e}"); + } + } + } + _ => { + invalidate_cache(self.cache.clone(), msg.clone()); + // It might happen that the invalid entry is on the way to be cached. + // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. + // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. + let cache = self.cache.clone(); + tokio::spawn(async move { + tokio::time::sleep(INVALIDATION_LAG).await; + invalidate_cache(cache, msg); + }); + } + } + + Ok(()) + } +} + fn invalidate_cache(cache: Arc, msg: Notification) { use Notification::*; match msg { @@ -74,50 +171,33 @@ fn invalidate_cache(cache: Arc, msg: Notification) { password_update.project_id, password_update.role_name, ), + Cancel(_) => unreachable!("cancel message should be handled separately"), } } -#[tracing::instrument(skip(cache))] -fn handle_message(msg: redis::Msg, cache: Arc) -> anyhow::Result<()> -where - C: ProjectInfoCache + Send + Sync + 'static, -{ - let payload: String = msg.get_payload()?; - tracing::debug!(?payload, "received a message payload"); - - let msg: Notification = match serde_json::from_str(&payload) { - Ok(msg) => msg, - Err(e) => { - tracing::error!("broken message: {e}"); - return Ok(()); - } - }; - tracing::debug!(?msg, "received a message"); - invalidate_cache(cache.clone(), msg.clone()); - // It might happen that the invalid entry is on the way to be cached. - // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. - // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. - tokio::spawn(async move { - tokio::time::sleep(INVALIDATION_LAG).await; - invalidate_cache(cache, msg.clone()); - }); - - Ok(()) -} - /// Handle console's invalidation messages. 
#[tracing::instrument(name = "console_notifications", skip_all)] -pub async fn task_main(url: String, cache: Arc) -> anyhow::Result +pub async fn task_main( + url: String, + cache: Arc, + cancel_map: CancelMap, + region_id: String, +) -> anyhow::Result where C: ProjectInfoCache + Send + Sync + 'static, { cache.enable_ttl(); + let handler = MessageHandler::new( + cache, + Arc::new(CancellationHandler::new(cancel_map, None)), + region_id, + ); loop { - let redis = ConsoleRedisClient::new(&url)?; + let redis = RedisConsumerClient::new(&url)?; let conn = match redis.try_connect().await { Ok(conn) => { - cache.disable_ttl(); + handler.disable_ttl(); conn } Err(e) => { @@ -130,7 +210,7 @@ where }; let mut stream = conn.into_on_message(); while let Some(msg) = stream.next().await { - match handle_message(msg, cache.clone()) { + match handler.handle_message(msg).await { Ok(()) => {} Err(e) => { tracing::error!("failed to handle message: {e}, will try to reconnect"); @@ -138,7 +218,7 @@ where } } } - cache.enable_ttl(); + handler.enable_ttl(); } } @@ -198,6 +278,33 @@ mod tests { } ); + Ok(()) + } + #[test] + fn parse_cancel_session() -> anyhow::Result<()> { + let cancel_key_data = CancelKeyData { + backend_pid: 42, + cancel_key: 41, + }; + let uuid = uuid::Uuid::new_v4(); + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: None, + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result); + + let msg = Notification::Cancel(CancelSession { + cancel_key_data, + region_id: Some("region".to_string()), + session_id: uuid, + }); + let text = serde_json::to_string(&msg)?; + let result: Notification = serde_json::from_str(&text)?; + assert_eq!(msg, result,); + Ok(()) } } diff --git a/proxy/src/redis/publisher.rs b/proxy/src/redis/publisher.rs new file mode 100644 index 0000000000..f85593afdd --- /dev/null +++ b/proxy/src/redis/publisher.rs @@ -0,0 +1,80 @@ +use pq_proto::CancelKeyData; +use redis::AsyncCommands; +use uuid::Uuid; + +use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter}; + +use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; + +pub struct RedisPublisherClient { + client: redis::Client, + publisher: Option, + region_id: String, + limiter: RedisRateLimiter, +} + +impl RedisPublisherClient { + pub fn new( + url: &str, + region_id: String, + info: &'static [RateBucketInfo], + ) -> anyhow::Result { + let client = redis::Client::open(url)?; + Ok(Self { + client, + publisher: None, + region_id, + limiter: RedisRateLimiter::new(info), + }) + } + pub async fn try_publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + if !self.limiter.check() { + tracing::info!("Rate limit exceeded. Skipping cancellation message"); + return Err(anyhow::anyhow!("Rate limit exceeded")); + } + match self.publish(cancel_key_data, session_id).await { + Ok(()) => return Ok(()), + Err(e) => { + tracing::error!("failed to publish a message: {e}"); + self.publisher = None; + } + } + tracing::info!("Publisher is disconnected. 
Reconnecting..."); + self.try_connect().await?; + self.publish(cancel_key_data, session_id).await + } + + async fn publish( + &mut self, + cancel_key_data: CancelKeyData, + session_id: Uuid, + ) -> anyhow::Result<()> { + let conn = self + .publisher + .as_mut() + .ok_or_else(|| anyhow::anyhow!("not connected"))?; + let payload = serde_json::to_string(&Notification::Cancel(CancelSession { + region_id: Some(self.region_id.clone()), + cancel_key_data, + session_id, + }))?; + conn.publish(PROXY_CHANNEL_NAME, payload).await?; + Ok(()) + } + pub async fn try_connect(&mut self) -> anyhow::Result<()> { + match self.client.get_async_connection().await { + Ok(conn) => { + self.publisher = Some(conn); + } + Err(e) => { + tracing::error!("failed to connect to redis: {e}"); + return Err(e.into()); + } + } + Ok(()) + } +} diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs index a20600b94a..ee3e91495b 100644 --- a/proxy/src/serverless.rs +++ b/proxy/src/serverless.rs @@ -24,7 +24,7 @@ use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE; use crate::protocol2::{ProxyProtocolAccept, WithClientIp}; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; -use crate::{cancellation::CancelMap, config::ProxyConfig}; +use crate::{cancellation::CancellationHandler, config::ProxyConfig}; use futures::StreamExt; use hyper::{ server::{ @@ -50,6 +50,7 @@ pub async fn task_main( ws_listener: TcpListener, cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellation_handler: Arc, ) -> anyhow::Result<()> { scopeguard::defer! { info!("websocket server has shut down"); @@ -115,7 +116,7 @@ pub async fn task_main( let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); - + let cancellation_handler = cancellation_handler.clone(); async move { let peer_addr = match client_addr { Some(addr) => addr, @@ -127,9 +128,9 @@ pub async fn task_main( let backend = backend.clone(); let ws_connections = ws_connections.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let cancellation_handler = cancellation_handler.clone(); async move { - let cancel_map = Arc::new(CancelMap::default()); let session_id = uuid::Uuid::new_v4(); request_handler( @@ -137,7 +138,7 @@ pub async fn task_main( config, backend, ws_connections, - cancel_map, + cancellation_handler, session_id, peer_addr.ip(), endpoint_rate_limiter, @@ -205,7 +206,7 @@ async fn request_handler( config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, - cancel_map: Arc, + cancellation_handler: Arc, session_id: uuid::Uuid, peer_addr: IpAddr, endpoint_rate_limiter: Arc, @@ -232,7 +233,7 @@ async fn request_handler( config, ctx, websocket, - cancel_map, + cancellation_handler, host, endpoint_rate_limiter, ) diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 062dd440b2..24f2bb7e8c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,5 +1,5 @@ use crate::{ - cancellation::CancelMap, + cancellation::CancellationHandler, config::ProxyConfig, context::RequestMonitoring, error::{io_error, ReportableError}, @@ -133,7 +133,7 @@ pub async fn serve_websocket( config: &'static ProxyConfig, mut ctx: RequestMonitoring, websocket: HyperWebsocket, - cancel_map: Arc, + cancellation_handler: Arc, hostname: Option, endpoint_rate_limiter: Arc, ) -> anyhow::Result<()> { @@ -141,7 +141,7 @@ pub async fn serve_websocket( let res = handle_client( config,
&mut ctx, - cancel_map, + cancellation_handler, WebSocketRw::new(websocket), ClientMode::Websockets { hostname }, endpoint_rate_limiter, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8e9cc43152..e808fabbe7 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -38,7 +38,7 @@ futures-io = { version = "0.3" } futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] } +hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } hashbrown-594e8ee84c453af0 = { package = "hashbrown", version = "0.13", features = ["raw"] } hex = { version = "0.4", features = ["serde"] } hmac = { version = "0.12", default-features = false, features = ["reset"] } @@ -91,7 +91,7 @@ cc = { version = "1", default-features = false, features = ["parallel"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } -hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", default-features = false, features = ["raw"] } +hashbrown-582f2526e08bb6a0 = { package = "hashbrown", version = "0.14", features = ["raw"] } indexmap = { version = "1", default-features = false, features = ["std"] } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } From 855d7b478156fbaacdc3b841e98671590c16b051 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 14 Feb 2024 10:26:32 +0000 Subject: [PATCH 120/209] hold cancel session (#6750) ## Problem In a recent refactor, we accidentally dropped the cancel session early ## Summary of changes Hold the cancel session during proxy passthrough --- proxy/src/proxy.rs | 1 + proxy/src/proxy/passthrough.rs | 2 ++ 2 files changed, 3 insertions(+) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index ce77098a5f..8a9445303a 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -331,6 +331,7 @@ pub async fn handle_client( compute: node, req: _request_gauge, conn: _client_gauge, + cancel: session, })) } diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index c98f68d8d1..73c170fc0b 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,4 +1,5 @@ use crate::{ + cancellation, compute::PostgresConnection, console::messages::MetricsAuxInfo, metrics::NUM_BYTES_PROXIED_COUNTER, @@ -57,6 +58,7 @@ pub struct ProxyPassthrough { pub req: IntCounterPairGuard, pub conn: IntCounterPairGuard, + pub cancel: cancellation::Session, } impl ProxyPassthrough { From edc691647d636408713b32d2a4a6a06cb737e345 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Wed, 14 Feb 2024 19:43:52 +0100 Subject: [PATCH 121/209] Proxy: remove fail fast logic to connect to compute (#6759) ## Problem Flaky tests ## Summary of changes Remove failfast logic --- proxy/src/proxy/connect_compute.rs | 35 ++++++++++++++--------------- proxy/src/proxy/tests.rs | 36 ------------------------------ 2 files changed, 17 insertions(+), 54 deletions(-) diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 6e57caf998..c76e2ff6d9 100644 --- a/proxy/src/proxy/connect_compute.rs +++ 
b/proxy/src/proxy/connect_compute.rs @@ -122,25 +122,24 @@ where error!(error = ?err, "could not connect to compute node"); - let node_info = - if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() { - // If the error is Postgres, that means that we managed to connect to the compute node, but there was an error. - // Do not need to retrieve a new node_info, just return the old one. - if !err.should_retry(num_retries) { - return Err(err.into()); - } - node_info - } else { - // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node - info!("compute node's state has likely changed; requesting a wake-up"); - ctx.latency_timer.cache_miss(); - let old_node_info = invalidate_cache(node_info); - let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; - node_info.reuse_settings(old_node_info); + let node_info = if !node_info.cached() { + // If we just received this from cplane and didn't get it from cache, we shouldn't retry. + // Do not need to retrieve a new node_info, just return the old one. + if !err.should_retry(num_retries) { + return Err(err.into()); + } + node_info + } else { + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); + ctx.latency_timer.cache_miss(); + let old_node_info = invalidate_cache(node_info); + let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; + node_info.reuse_settings(old_node_info); - mechanism.update_connect_config(&mut node_info.config); - node_info - }; + mechanism.update_connect_config(&mut node_info.config); + node_info + }; // now that we have a new node, try connect to it repeatedly. // this can error for a few reasons, for instance: diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index efbd661bbf..1a01f32339 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -375,8 +375,6 @@ enum ConnectAction { Connect, Retry, Fail, - RetryPg, - FailPg, } #[derive(Clone)] @@ -466,14 +464,6 @@ impl ConnectMechanism for TestConnectMechanism { retryable: false, kind: ErrorKind::Compute, }), - ConnectAction::FailPg => Err(TestConnectError { - retryable: false, - kind: ErrorKind::Postgres, - }), - ConnectAction::RetryPg => Err(TestConnectError { - retryable: true, - kind: ErrorKind::Postgres, - }), x => panic!("expecting action {:?}, connect is called instead", x), } } @@ -572,32 +562,6 @@ async fn connect_to_compute_retry() { mechanism.verify(); } -#[tokio::test] -async fn connect_to_compute_retry_pg() { - let _ = env_logger::try_init(); - use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]); - let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap(); - mechanism.verify(); -} - -#[tokio::test] -async fn connect_to_compute_fail_pg() { - let _ = env_logger::try_init(); - use ConnectAction::*; - let mut ctx = RequestMonitoring::test(); - let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]); - let user_info = helper_create_connect_info(&mechanism); - connect_to_compute(&mut ctx, &mechanism, &user_info, false) - .await - .unwrap_err(); - mechanism.verify(); -} - /// Test that we don't retry if the error is not retryable.
#[tokio::test] async fn connect_to_compute_non_retry_1() { From 96a4e8de660be469fb00efd7d268120890ca06fd Mon Sep 17 00:00:00 2001 From: Nikita Kalyanov <44959448+nikitakalyanov@users.noreply.github.com> Date: Thu, 22 Feb 2024 11:51:19 +0200 Subject: [PATCH 122/209] Add /terminate API (#6745) (#6853) this is to speed up suspends, see https://github.com/neondatabase/cloud/issues/10284 Cherry-pick to release branch to build new compute images --- compute_tools/src/bin/compute_ctl.rs | 25 +++++------ compute_tools/src/compute.rs | 16 +++++++ compute_tools/src/http/api.rs | 55 ++++++++++++++++++++++++ compute_tools/src/http/openapi_spec.yaml | 23 ++++++++++ control_plane/src/endpoint.rs | 4 +- libs/compute_api/src/responses.rs | 4 ++ 6 files changed, 114 insertions(+), 13 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index a7e10d0aee..117919786e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -45,7 +45,6 @@ use std::{thread, time::Duration}; use anyhow::{Context, Result}; use chrono::Utc; use clap::Arg; -use nix::sys::signal::{kill, Signal}; use signal_hook::consts::{SIGQUIT, SIGTERM}; use signal_hook::{consts::SIGINT, iterator::Signals}; use tracing::{error, info}; @@ -53,7 +52,9 @@ use url::Url; use compute_api::responses::ComputeStatus; -use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID}; +use compute_tools::compute::{ + forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID, +}; use compute_tools::configurator::launch_configurator; use compute_tools::extension_server::get_pg_version; use compute_tools::http::api::launch_http_server; @@ -394,6 +395,15 @@ fn main() -> Result<()> { info!("synced safekeepers at lsn {lsn}"); } + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::TerminationPending { + state.status = ComputeStatus::Terminated; + compute.state_changed.notify_all(); + // we were asked to terminate gracefully, don't exit to avoid restart + delay_exit = true + } + drop(state); + if let Err(err) = compute.check_for_core_dumps() { error!("error while checking for core dumps: {err:?}"); } @@ -523,16 +533,7 @@ fn cli() -> clap::Command { /// wait for termination which would be easy then. 
fn handle_exit_signal(sig: i32) { info!("received {sig} termination signal"); - let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); - if ss_pid != 0 { - let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); - kill(ss_pid, Signal::SIGTERM).ok(); - } - let pg_pid = PG_PID.load(Ordering::SeqCst); - if pg_pid != 0 { - let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); - kill(pg_pid, Signal::SIGTERM).ok(); - } + forward_termination_signal(); exit(1); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1c5363d048..142bb14fe5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -28,6 +28,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; +use nix::sys::signal::{kill, Signal}; + use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; @@ -1322,3 +1324,17 @@ LIMIT 100", Ok(remote_ext_metrics) } } + +pub fn forward_termination_signal() { + let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst); + if ss_pid != 0 { + let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32); + kill(ss_pid, Signal::SIGTERM).ok(); + } + let pg_pid = PG_PID.load(Ordering::SeqCst); + if pg_pid != 0 { + let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); + // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html + kill(pg_pid, Signal::SIGQUIT).ok(); + } +} diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index fa2c4cff28..f076951239 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -5,6 +5,7 @@ use std::net::SocketAddr; use std::sync::Arc; use std::thread; +use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_api::requests::ConfigurationRequest; use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; @@ -123,6 +124,17 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /terminate POST request"); + match handle_terminate_request(compute).await { + Ok(()) => Response::new(Body::empty()), + Err((msg, code)) => { + error!("error handling /terminate request: {msg}"); + render_json_error(&msg, code) + } + } + } + // download extension files from remote extension storage on demand (&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); @@ -297,6 +309,49 @@ fn render_json_error(e: &str, status: StatusCode) -> Response { .unwrap() } +async fn handle_terminate_request(compute: &Arc) -> Result<(), (String, StatusCode)> { + { + let mut state = compute.state.lock().unwrap(); + if state.status == ComputeStatus::Terminated { + return Ok(()); + } + if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for termination request: {:?}", + state.status.clone() + ); + return Err((msg, StatusCode::PRECONDITION_FAILED)); + } + state.status = ComputeStatus::TerminationPending; + compute.state_changed.notify_all(); + drop(state); + } + forward_termination_signal(); + info!("sent signal and notified waiters"); + + // Spawn a blocking thread to wait for compute to become Terminated. 
+ // This is needed to do not block the main pool of workers and + // be able to serve other requests while some particular request + // is waiting for compute to finish configuration. + let c = compute.clone(); + task::spawn_blocking(move || { + let mut state = c.state.lock().unwrap(); + while state.status != ComputeStatus::Terminated { + state = c.state_changed.wait(state).unwrap(); + info!( + "waiting for compute to become Terminated, current status: {:?}", + state.status + ); + } + + Ok(()) + }) + .await + .unwrap()?; + info!("terminated Postgres"); + Ok(()) +} + // Main Hyper HTTP server function that runs it and blocks waiting on it forever. #[tokio::main] async fn serve(port: u16, state: Arc) { diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index cedc6ece8f..d2ec54299f 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -168,6 +168,29 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /terminate: + post: + tags: + - Terminate + summary: Terminate Postgres and wait for it to exit + description: "" + operationId: terminate + responses: + 200: + description: Result + 412: + description: "wrong state" + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: "Unexpected error" + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + components: securitySchemes: JWT: diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index f1fe12e05f..ce8f035dfc 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -652,7 +652,9 @@ impl Endpoint { } ComputeStatus::Empty | ComputeStatus::ConfigurationPending - | ComputeStatus::Configuration => { + | ComputeStatus::Configuration + | ComputeStatus::TerminationPending + | ComputeStatus::Terminated => { bail!("unexpected compute status: {:?}", state.status) } } diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 92bbf79cd4..fd0c90d447 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -52,6 +52,10 @@ pub enum ComputeStatus { // compute will exit soon or is waiting for // control-plane to terminate it. Failed, + // Termination requested + TerminationPending, + // Terminated Postgres + Terminated, } fn rfc3339_serialize(x: &Option>, s: S) -> Result From 95a184e9b7fd7ef9dba4cf46c3baf0fad5541982 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 11 Apr 2024 21:55:05 +0100 Subject: [PATCH 123/209] proxy: fix overloaded db connection closure (#7364) ## Problem possible for the database connections to not close in time. 
## Summary of changes force the closing of connections if the client has hung up --- proxy/src/serverless/conn_pool.rs | 36 +++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 35311facb8..0a89c56b6f 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -16,6 +16,7 @@ use std::{ use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_util::sync::CancellationToken; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; @@ -469,15 +470,32 @@ pub fn poll_client( let db_user = conn_info.db_and_user(); let idle = global_pool.get_idle_timeout(); + let cancel = CancellationToken::new(); + let cancelled = cancel.clone().cancelled_owned(); + tokio::spawn( async move { let _conn_gauge = conn_gauge; let mut idle_timeout = pin!(tokio::time::sleep(idle)); + let mut cancelled = pin!(cancelled); + poll_fn(move |cx| { - if matches!(rx.has_changed(), Ok(true)) { - session_id = *rx.borrow_and_update(); - info!(%session_id, "changed session"); - idle_timeout.as_mut().reset(Instant::now() + idle); + if cancelled.as_mut().poll(cx).is_ready() { + info!("connection dropped"); + return Poll::Ready(()) + } + + match rx.has_changed() { + Ok(true) => { + session_id = *rx.borrow_and_update(); + info!(%session_id, "changed session"); + idle_timeout.as_mut().reset(Instant::now() + idle); + } + Err(_) => { + info!("connection dropped"); + return Poll::Ready(()) + } + _ => {} } // 5 minute idle connection timeout @@ -532,6 +550,7 @@ pub fn poll_client( let inner = ClientInner { inner: client, session: tx, + cancel, aux, conn_id, }; @@ -541,10 +560,18 @@ pub fn poll_client( struct ClientInner { inner: C, session: tokio::sync::watch::Sender, + cancel: CancellationToken, aux: MetricsAuxInfo, conn_id: uuid::Uuid, } +impl Drop for ClientInner { + fn drop(&mut self) { + // on client drop, tell the conn to shut down + self.cancel.cancel(); + } +} + pub trait ClientInnerExt: Sync + Send + 'static { fn is_closed(&self) -> bool; fn get_process_id(&self) -> i32; @@ -697,6 +724,7 @@ mod tests { ClientInner { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + cancel: CancellationToken::new(), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), From e07f68923856d943d76a71525bcb9476539e8976 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 25 Apr 2024 13:16:27 +0200 Subject: [PATCH 124/209] Update connect to compute and wake compute retry configs (#7509) ## Problem ## Summary of changes Decrease waiting time --- proxy/src/config.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/proxy/src/config.rs b/proxy/src/config.rs index ae7606e5d4..a32ab8c43c 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -533,13 +533,13 @@ pub struct RetryConfig { impl RetryConfig { /// Default options for RetryConfig. - /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s. + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. 
pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = - "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0"; - /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s. - /// Cplane has timeout of 60s on each request. + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; + /// Total delay for 8 retries with 100ms base delay and 1.6 backoff factor is about 7s. + /// Cplane has timeout of 60s on each request. 8m7s in total. pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str = - "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0"; + "num_retries=8,base_retry_wait_duration=100ms,retry_wait_exponent_base=1.6"; /// Parse retry options passed via cmdline. /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`]. From a42d173e7bd02884dbf88bfdc7549253deb74085 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 25 Apr 2024 13:38:51 +0200 Subject: [PATCH 125/209] proxy: Fix cancellations (#7510) ## Problem Cancellations were published to the channel, that was never read. ## Summary of changes Fallback to global redis publishing. --- proxy/src/bin/proxy.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 760ccf40d4..a1b4c21947 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -339,7 +339,7 @@ async fn main() -> anyhow::Result<()> { let cancel_map = CancelMap::default(); - let redis_publisher = match ®ional_redis_client { + let redis_publisher = match &redis_notifications_client { Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new( redis_publisher.clone(), args.region.clone(), From 99595813bbff694640e92c7a7b0e94f5d5e0a713 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 2 May 2024 11:50:11 +0200 Subject: [PATCH 126/209] proxy: keep track on the number of events from redis by type. (#7582) ## Problem It's unclear what is the distribution of messages, proxy is consuming from redis. ## Summary of changes Add counter. 
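For illustration only (not part of this patch): the diff below wires the counter up with the `measured` metrics crate that proxy already uses. As a rough, self-contained sketch of the idea (a fixed set of event labels, one monotonically increasing counter per label), the following simplified stand-in mirrors the new `RedisEventsCount` variants while replacing the real labelled `CounterVec` with a plain atomic array:

```rust
use std::sync::atomic::{AtomicU64, Ordering};

// Event types consumed from Redis, mirroring the variants added in this patch.
#[allow(dead_code)]
#[derive(Clone, Copy)]
enum RedisEventsCount {
    EndpointCreated,
    BranchCreated,
    ProjectCreated,
    CancelSession,
    PasswordUpdate,
    AllowedIpsUpdate,
}

// Simplified stand-in for the labelled counter: one atomic per label value.
struct RedisEventsCounter {
    counts: [AtomicU64; 6],
}

impl RedisEventsCounter {
    fn new() -> Self {
        Self {
            counts: std::array::from_fn(|_| AtomicU64::new(0)),
        }
    }

    fn inc(&self, event: RedisEventsCount) {
        self.counts[event as usize].fetch_add(1, Ordering::Relaxed);
    }

    fn get(&self, event: RedisEventsCount) -> u64 {
        self.counts[event as usize].load(Ordering::Relaxed)
    }
}

fn main() {
    let redis_events = RedisEventsCounter::new();
    redis_events.inc(RedisEventsCount::CancelSession);
    redis_events.inc(RedisEventsCount::AllowedIpsUpdate);
    assert_eq!(redis_events.get(RedisEventsCount::CancelSession), 1);
}
```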
--- proxy/src/cache/endpoints.rs | 14 +++++++++++++- proxy/src/metrics.rs | 14 ++++++++++++++ proxy/src/redis/notifications.rs | 17 ++++++++++++++++- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 02511e6ff7..4bc10a6020 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -21,7 +21,7 @@ use crate::{ config::EndpointCacheConfig, context::RequestMonitoring, intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::{Metrics, RedisErrors}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, rate_limiter::GlobalRateLimiter, redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, EndpointId, @@ -100,14 +100,26 @@ impl EndpointsCache { if let Some(endpoint_created) = key.endpoint_created { self.endpoints .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::EndpointCreated); } if let Some(branch_created) = key.branch_created { self.branches .insert(BranchIdInt::from(&branch_created.branch_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BranchCreated); } if let Some(project_created) = key.project_created { self.projects .insert(ProjectIdInt::from(&project_created.project_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::ProjectCreated); } } pub async fn do_read( diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index c129ece059..4a54857012 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -123,6 +123,9 @@ pub struct ProxyMetrics { /// Number of retries (per outcome, per retry_type). #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))] pub retries_metric: HistogramVec, + + /// Number of events consumed from redis (per event type). + pub redis_events_count: CounterVec>, } #[derive(MetricGroup)] @@ -530,3 +533,14 @@ pub enum RetryType { WakeCompute, ConnectToCompute, } + +#[derive(FixedCardinalityLabel, Clone, Copy, Debug)] +#[label(singleton = "event")] +pub enum RedisEventsCount { + EndpointCreated, + BranchCreated, + ProjectCreated, + CancelSession, + PasswordUpdate, + AllowedIpsUpdate, +} diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 5a38530faf..ba4dfb755e 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -11,7 +11,7 @@ use crate::{ cache::project_info::ProjectInfoCache, cancellation::{CancelMap, CancellationHandler}, intern::{ProjectIdInt, RoleNameInt}, - metrics::{Metrics, RedisErrors}, + metrics::{Metrics, RedisErrors, RedisEventsCount}, }; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; @@ -118,6 +118,10 @@ impl MessageHandler { "session_id", &tracing::field::display(cancel_session.session_id), ); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::CancelSession); if let Some(cancel_region) = cancel_session.region_id { // If the message is not for this region, ignore it. if cancel_region != self.region_id { @@ -138,6 +142,17 @@ impl MessageHandler { } _ => { invalidate_cache(self.cache.clone(), msg.clone()); + if matches!(msg, AllowedIpsUpdate { .. }) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::AllowedIpsUpdate); + } else if matches!(msg, PasswordUpdate { .. 
}) { + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::PasswordUpdate); + } // It might happen that the invalid entry is on the way to be cached. // To make sure that the entry is invalidated, let's repeat the invalidation in INVALIDATION_LAG seconds. // TODO: include the version (or the timestamp) in the message and invalidate only if the entry is cached before the message. From 03c2c569be7a0e379743d99884c2990d484c67f2 Mon Sep 17 00:00:00 2001 From: Anna Khanova <32508607+khanova@users.noreply.github.com> Date: Thu, 23 May 2024 11:41:29 +0200 Subject: [PATCH 127/209] [proxy] Do not fail after parquet upload error (#7858) ## Problem If the parquet upload was unsuccessful, it will panic. ## Summary of changes Write error in logs instead. --- proxy/src/context/parquet.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 392821c430..a213a32ca4 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -355,7 +355,7 @@ async fn upload_parquet( "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet" ))?; let cancel = CancellationToken::new(); - backoff::retry( + let maybe_err = backoff::retry( || async { let stream = futures::stream::once(futures::future::ready(Ok(data.clone()))); storage @@ -372,7 +372,12 @@ async fn upload_parquet( .await .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel)) .and_then(|x| x) - .context("request_data_upload")?; + .context("request_data_upload") + .err(); + + if let Some(err) = maybe_err { + tracing::warn!(%id, %err, "failed to upload request data"); + } Ok(buffer.writer()) } From d81ef3f962672cde16ed76c5a8f2a21cff9ad801 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 27 Jun 2024 09:46:58 +0100 Subject: [PATCH 128/209] Revert "proxy: update tokio-postgres to allow arbitrary config params (#8076)" This reverts commit 78d9059fc7490e9c9374e80e04507a88861bd89a. 
--- Cargo.lock | 8 +- libs/postgres_connection/src/lib.rs | 50 +++++----- proxy/src/compute.rs | 129 ++++++++++++-------------- proxy/src/serverless/backend.rs | 4 - proxy/src/serverless/sql_over_http.rs | 1 - test_runner/regress/test_proxy.py | 19 ---- 6 files changed, 92 insertions(+), 119 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5393538c59..6b9a169ebf 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4020,7 +4020,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -4033,7 +4033,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "base64 0.20.0", "byteorder", @@ -4052,7 +4052,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "bytes", "fallible-iterator", @@ -6225,7 +6225,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#cff6927e4f58b1af6ecc2ee7279df1f2ff537295" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" dependencies = [ "async-trait", "byteorder", diff --git a/libs/postgres_connection/src/lib.rs b/libs/postgres_connection/src/lib.rs index fdabcbacb2..9f57f3d507 100644 --- a/libs/postgres_connection/src/lib.rs +++ b/libs/postgres_connection/src/lib.rs @@ -144,7 +144,20 @@ impl PgConnectionConfig { // implement and this function is hardly a bottleneck. The function is only called around // establishing a new connection. #[allow(unstable_name_collisions)] - config.options(&encode_options(&self.options)); + config.options( + &self + .options + .iter() + .map(|s| { + if s.contains(['\\', ' ']) { + Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) + } else { + Cow::Borrowed(s.as_str()) + } + }) + .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized + .collect::(), + ); } config } @@ -165,21 +178,6 @@ impl PgConnectionConfig { } } -#[allow(unstable_name_collisions)] -fn encode_options(options: &[String]) -> String { - options - .iter() - .map(|s| { - if s.contains(['\\', ' ']) { - Cow::Owned(s.replace('\\', "\\\\").replace(' ', "\\ ")) - } else { - Cow::Borrowed(s.as_str()) - } - }) - .intersperse(Cow::Borrowed(" ")) // TODO: use impl from std once it's stabilized - .collect::() -} - impl fmt::Display for PgConnectionConfig { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // The password is intentionally hidden and not part of this display string. 
@@ -208,7 +206,7 @@ impl fmt::Debug for PgConnectionConfig { #[cfg(test)] mod tests_pg_connection_config { - use crate::{encode_options, PgConnectionConfig}; + use crate::PgConnectionConfig; use once_cell::sync::Lazy; use url::Host; @@ -257,12 +255,18 @@ mod tests_pg_connection_config { #[test] fn test_with_options() { - let options = encode_options(&[ - "hello".to_owned(), - "world".to_owned(), - "with space".to_owned(), - "and \\ backslashes".to_owned(), + let cfg = PgConnectionConfig::new_host_port(STUB_HOST.clone(), 123).extend_options([ + "hello", + "world", + "with space", + "and \\ backslashes", ]); - assert_eq!(options, "hello world with\\ space and\\ \\\\\\ backslashes"); + assert_eq!(cfg.host(), &*STUB_HOST); + assert_eq!(cfg.port(), 123); + assert_eq!(cfg.raw_address(), "stub.host.example:123"); + assert_eq!( + cfg.to_tokio_postgres_config().get_options(), + Some("hello world with\\ space and\\ \\\\\\ backslashes") + ); } } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index a50a96e5e8..feb09d5638 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -103,8 +103,12 @@ impl ConnCfg { /// Reuse password or auth keys from the other config. pub fn reuse_password(&mut self, other: Self) { - if let Some(password) = other.get_auth() { - self.auth(password); + if let Some(password) = other.get_password() { + self.password(password); + } + + if let Some(keys) = other.get_auth_keys() { + self.auth_keys(keys); } } @@ -120,64 +124,48 @@ impl ConnCfg { /// Apply startup message params to the connection config. pub fn set_startup_params(&mut self, params: &StartupMessageParams) { - let mut client_encoding = false; - for (k, v) in params.iter() { - match k { - "user" => { - // Only set `user` if it's not present in the config. - // Link auth flow takes username from the console's response. - if self.get_user().is_none() { - self.user(v); - } + // Only set `user` if it's not present in the config. + // Link auth flow takes username from the console's response. + if let (None, Some(user)) = (self.get_user(), params.get("user")) { + self.user(user); + } + + // Only set `dbname` if it's not present in the config. + // Link auth flow takes dbname from the console's response. + if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { + self.dbname(dbname); + } + + // Don't add `options` if they were only used for specifying a project. + // Connection pools don't support `options`, because they affect backend startup. + if let Some(options) = filtered_options(params) { + self.options(&options); + } + + if let Some(app_name) = params.get("application_name") { + self.application_name(app_name); + } + + // TODO: This is especially ugly... + if let Some(replication) = params.get("replication") { + use tokio_postgres::config::ReplicationMode; + match replication { + "true" | "on" | "yes" | "1" => { + self.replication_mode(ReplicationMode::Physical); } "database" => { - // Only set `dbname` if it's not present in the config. - // Link auth flow takes dbname from the console's response. - if self.get_dbname().is_none() { - self.dbname(v); - } - } - "options" => { - // Don't add `options` if they were only used for specifying a project. - // Connection pools don't support `options`, because they affect backend startup. 
- if let Some(options) = filtered_options(v) { - self.options(&options); - } - } - - // the special ones in tokio-postgres that we don't want being set by the user - "dbname" => {} - "password" => {} - "sslmode" => {} - "host" => {} - "port" => {} - "connect_timeout" => {} - "keepalives" => {} - "keepalives_idle" => {} - "keepalives_interval" => {} - "keepalives_retries" => {} - "target_session_attrs" => {} - "channel_binding" => {} - "max_backend_message_size" => {} - - "client_encoding" => { - client_encoding = true; - // only error should be from bad null bytes, - // but we've already checked for those. - _ = self.param("client_encoding", v); - } - - _ => { - // only error should be from bad null bytes, - // but we've already checked for those. - _ = self.param(k, v); + self.replication_mode(ReplicationMode::Logical); } + _other => {} } } - if !client_encoding { - // for compatibility since we removed it from tokio-postgres - self.param("client_encoding", "UTF8").unwrap(); - } + + // TODO: extend the list of the forwarded startup parameters. + // Currently, tokio-postgres doesn't allow us to pass + // arbitrary parameters, but the ones above are a good start. + // + // This and the reverse params problem can be better addressed + // in a bespoke connection machinery (a new library for that sake). } } @@ -350,9 +338,10 @@ impl ConnCfg { } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. -fn filtered_options(options: &str) -> Option { +fn filtered_options(params: &StartupMessageParams) -> Option { #[allow(unstable_name_collisions)] - let options: String = StartupMessageParams::parse_options_raw(options) + let options: String = params + .options_raw()? .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -424,23 +413,27 @@ mod tests { #[test] fn test_filtered_options() { // Empty options is unlikely to be useful anyway. - assert_eq!(filtered_options(""), None); + let params = StartupMessageParams::new([("options", "")]); + assert_eq!(filtered_options(¶ms), None); // It's likely that clients will only use options to specify endpoint/project. - let params = "project=foo"; - assert_eq!(filtered_options(params), None); + let params = StartupMessageParams::new([("options", "project=foo")]); + assert_eq!(filtered_options(¶ms), None); // Same, because unescaped whitespaces are no-op. 
- let params = " project=foo "; - assert_eq!(filtered_options(params), None); + let params = StartupMessageParams::new([("options", " project=foo ")]); + assert_eq!(filtered_options(¶ms).as_deref(), None); - let params = r"\ project=foo \ "; - assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ ")); + let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); + assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ ")); - let params = "project = foo"; - assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); + let params = StartupMessageParams::new([("options", "project = foo")]); + assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); - let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2"; - assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); + let params = StartupMessageParams::new([( + "options", + "project = foo neon_endpoint_type:read_write neon_lsn:0/2", + )]); + assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 6c34d48338..c73ea8646a 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -236,10 +236,6 @@ impl ConnectMechanism for TokioMechanism { .dbname(&self.conn_info.dbname) .connect_timeout(timeout); - config - .param("client_encoding", "UTF8") - .expect("client encoding UTF8 is always valid"); - let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute); let res = config.connect(tokio_postgres::NoTls).await; drop(pause); diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 583ff75f7c..7a99aeb759 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -202,7 +202,6 @@ fn get_conn_info( options = Some(NeonOptions::parse_options_raw(&value)); } } - ctx.set_db_options(params.freeze()); let user_info = ComputeUserInfo { endpoint, diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 8ed44b1094..f446f4f200 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -53,25 +53,6 @@ def test_proxy_select_1(static_proxy: NeonProxy): assert out[0][0] == 42 -def test_proxy_server_params(static_proxy: NeonProxy): - """ - Test that server params are passing through to postgres - """ - - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=iso_8601" - ) - assert out[0][0] == "PT0S" - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=sql_standard" - ) - assert out[0][0] == "0" - out = static_proxy.safe_psql( - "select to_json('0 seconds'::interval)", options="-c intervalstyle=postgres" - ) - assert out[0][0] == "00:00:00" - - def test_password_hack(static_proxy: NeonProxy): """ Check the PasswordHack auth flow: an alternative to SCRAM auth for From 76077e1ddf9c0c67263c12871a561d38aae3a763 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 21 Nov 2024 06:02:11 +0000 Subject: [PATCH 129/209] Proxy release 2024-11-21 From 9052c32b46bc0606bf720fc04ddd4489c6e98dd2 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 28 Nov 2024 06:02:15 +0000 Subject: [PATCH 130/209] Proxy release 2024-11-28 From 268bc890ea309db615d0be7e6ef22f5d938dfa4f Mon Sep 17 00:00:00 2001 From: Ivan Efremov 
Date: Thu, 28 Nov 2024 08:32:22 +0200 Subject: [PATCH 131/209] proxy: spawn cancellation checks in the background (#9918) ## Problem For cancellation, a connection is open during all the cancel checks. ## Summary of changes Spawn cancellation checks in the background, and close connection immediately. Use task_tracker for cancellation checks. --- proxy/src/cancellation.rs | 15 ++++++++----- proxy/src/console_redirect_proxy.rs | 35 +++++++++++++++++++++-------- proxy/src/proxy/mod.rs | 35 +++++++++++++++++++++-------- proxy/src/redis/notifications.rs | 2 +- proxy/src/serverless/mod.rs | 9 ++++++++ proxy/src/serverless/websocket.rs | 3 +++ 6 files changed, 75 insertions(+), 24 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 74415f1ffe..91e198bf88 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -99,16 +99,17 @@ impl CancellationHandler
{ /// Try to cancel a running query for the corresponding connection. /// If the cancellation key is not found, it will be published to Redis. /// check_allowed - if true, check if the IP is allowed to cancel the query + /// return Result primarily for tests pub(crate) async fn cancel_session( &self, key: CancelKeyData, session_id: Uuid, - peer_addr: &IpAddr, + peer_addr: IpAddr, check_allowed: bool, ) -> Result<(), CancelError> { // TODO: check for unspecified address is only for backward compatibility, should be removed if !peer_addr.is_unspecified() { - let subnet_key = match *peer_addr { + let subnet_key = match peer_addr { IpAddr::V4(ip) => IpNet::V4(Ipv4Net::new_assert(ip, 24).trunc()), // use defaut mask here IpAddr::V6(ip) => IpNet::V6(Ipv6Net::new_assert(ip, 64).trunc()), }; @@ -141,9 +142,11 @@ impl CancellationHandler
{ return Ok(()); } - match self.client.try_publish(key, session_id, *peer_addr).await { + match self.client.try_publish(key, session_id, peer_addr).await { Ok(()) => {} // do nothing Err(e) => { + // log it here since cancel_session could be spawned in a task + tracing::error!("failed to publish cancellation key: {key}, error: {e}"); return Err(CancelError::IO(std::io::Error::new( std::io::ErrorKind::Other, e.to_string(), @@ -154,8 +157,10 @@ impl CancellationHandler
{ }; if check_allowed - && !check_peer_addr_is_in_list(peer_addr, cancel_closure.ip_allowlist.as_slice()) + && !check_peer_addr_is_in_list(&peer_addr, cancel_closure.ip_allowlist.as_slice()) { + // log it here since cancel_session could be spawned in a task + tracing::warn!("IP is not allowed to cancel the query: {key}"); return Err(CancelError::IpNotAllowed); } @@ -306,7 +311,7 @@ mod tests { cancel_key: 0, }, Uuid::new_v4(), - &("127.0.0.1".parse().unwrap()), + "127.0.0.1".parse().unwrap(), true, ) .await diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index b910b524b1..8f78df1964 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -35,6 +35,7 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -48,6 +49,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); + let cancellations = cancellations.clone(); debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); @@ -96,6 +98,7 @@ pub async fn task_main( cancellation_handler, socket, conn_gauge, + cancellations, ) .instrument(ctx.span()) .boxed() @@ -127,10 +130,12 @@ pub async fn task_main( } connections.close(); + cancellations.close(); drop(listener); // Drain connections connections.wait().await; + cancellations.wait().await; Ok(()) } @@ -142,6 +147,7 @@ pub(crate) async fn handle_client( cancellation_handler: Arc, stream: S, conn_gauge: NumClientConnectionsGuard<'static>, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), @@ -161,15 +167,26 @@ pub(crate) async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancellation_handler - .cancel_session( - cancel_key_data, - ctx.session_id(), - &ctx.peer_addr(), - config.authentication_config.ip_allowlist_check_enabled, - ) - .await - .map(|()| None)?) 
+ // spawn a task to cancel the session, but don't wait for it + cancellations.spawn({ + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session_id = ctx.session_id(); + let peer_ip = ctx.peer_addr(); + async move { + drop( + cancellation_handler_clone + .cancel_session( + cancel_key_data, + session_id, + peer_ip, + config.authentication_config.ip_allowlist_check_enabled, + ) + .await, + ); + } + }); + + return Ok(None); } }; drop(pause); diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 7fe67e43de..956036d29d 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -69,6 +69,7 @@ pub async fn task_main( socket2::SockRef::from(&listener).set_keepalive(true)?; let connections = tokio_util::task::task_tracker::TaskTracker::new(); + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(accept_result) = run_until_cancelled(listener.accept(), &cancellation_token).await @@ -82,6 +83,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); + let cancellations = cancellations.clone(); debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); @@ -133,6 +135,7 @@ pub async fn task_main( ClientMode::Tcp, endpoint_rate_limiter2, conn_gauge, + cancellations, ) .instrument(ctx.span()) .boxed() @@ -164,10 +167,12 @@ pub async fn task_main( } connections.close(); + cancellations.close(); drop(listener); // Drain connections connections.wait().await; + cancellations.wait().await; Ok(()) } @@ -250,6 +255,7 @@ pub(crate) async fn handle_client( mode: ClientMode, endpoint_rate_limiter: Arc, conn_gauge: NumClientConnectionsGuard<'static>, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> Result>, ClientRequestError> { debug!( protocol = %ctx.protocol(), @@ -270,15 +276,26 @@ pub(crate) async fn handle_client( match tokio::time::timeout(config.handshake_timeout, do_handshake).await?? { HandshakeData::Startup(stream, params) => (stream, params), HandshakeData::Cancel(cancel_key_data) => { - return Ok(cancellation_handler - .cancel_session( - cancel_key_data, - ctx.session_id(), - &ctx.peer_addr(), - config.authentication_config.ip_allowlist_check_enabled, - ) - .await - .map(|()| None)?) 
+ // spawn a task to cancel the session, but don't wait for it + cancellations.spawn({ + let cancellation_handler_clone = Arc::clone(&cancellation_handler); + let session_id = ctx.session_id(); + let peer_ip = ctx.peer_addr(); + async move { + drop( + cancellation_handler_clone + .cancel_session( + cancel_key_data, + session_id, + peer_ip, + config.authentication_config.ip_allowlist_check_enabled, + ) + .await, + ); + } + }); + + return Ok(None); } }; drop(pause); diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index 65008ae943..9ac07b7e90 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -149,7 +149,7 @@ impl MessageHandler { .cancel_session( cancel_session.cancel_key_data, uuid::Uuid::nil(), - &peer_addr, + peer_addr, cancel_session.peer_addr.is_some(), ) .await diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 77025f419d..80b42f9e55 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -132,6 +132,7 @@ pub async fn task_main( let connections = tokio_util::task::task_tracker::TaskTracker::new(); connections.close(); // allows `connections.wait to complete` + let cancellations = tokio_util::task::task_tracker::TaskTracker::new(); while let Some(res) = run_until_cancelled(ws_listener.accept(), &cancellation_token).await { let (conn, peer_addr) = res.context("could not accept TCP stream")?; if let Err(e) = conn.set_nodelay(true) { @@ -160,6 +161,7 @@ pub async fn task_main( let connections2 = connections.clone(); let cancellation_handler = cancellation_handler.clone(); let endpoint_rate_limiter = endpoint_rate_limiter.clone(); + let cancellations = cancellations.clone(); connections.spawn( async move { let conn_token2 = conn_token.clone(); @@ -188,6 +190,7 @@ pub async fn task_main( config, backend, connections2, + cancellations, cancellation_handler, endpoint_rate_limiter, conn_token, @@ -313,6 +316,7 @@ async fn connection_handler( config: &'static ProxyConfig, backend: Arc, connections: TaskTracker, + cancellations: TaskTracker, cancellation_handler: Arc, endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, @@ -353,6 +357,7 @@ async fn connection_handler( // `request_handler` is not cancel safe. It expects to be cancelled only at specific times. // By spawning the future, we ensure it never gets cancelled until it decides to. + let cancellations = cancellations.clone(); let handler = connections.spawn( request_handler( req, @@ -364,6 +369,7 @@ async fn connection_handler( conn_info2.clone(), http_request_token, endpoint_rate_limiter.clone(), + cancellations, ) .in_current_span() .map_ok_or_else(api_error_into_response, |r| r), @@ -411,6 +417,7 @@ async fn request_handler( // used to cancel in-flight HTTP requests. 
not used to cancel websockets http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, + cancellations: TaskTracker, ) -> Result>, ApiError> { let host = request .headers() @@ -436,6 +443,7 @@ async fn request_handler( let (response, websocket) = framed_websockets::upgrade::upgrade(&mut request) .map_err(|e| ApiError::BadRequest(e.into()))?; + let cancellations = cancellations.clone(); ws_connections.spawn( async move { if let Err(e) = websocket::serve_websocket( @@ -446,6 +454,7 @@ async fn request_handler( cancellation_handler, endpoint_rate_limiter, host, + cancellations, ) .await { diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 4088fea835..bdb83fe6be 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -123,6 +123,7 @@ impl AsyncBufRead for WebSocketRw { } } +#[allow(clippy::too_many_arguments)] pub(crate) async fn serve_websocket( config: &'static ProxyConfig, auth_backend: &'static crate::auth::Backend<'static, ()>, @@ -131,6 +132,7 @@ pub(crate) async fn serve_websocket( cancellation_handler: Arc, endpoint_rate_limiter: Arc, hostname: Option, + cancellations: tokio_util::task::task_tracker::TaskTracker, ) -> anyhow::Result<()> { let websocket = websocket.await?; let websocket = WebSocketServer::after_handshake(TokioIo::new(websocket)); @@ -149,6 +151,7 @@ pub(crate) async fn serve_websocket( ClientMode::Websockets { hostname }, endpoint_rate_limiter, conn_gauge, + cancellations, )) .await; From 03d90bc0b353dcbdaa1adc226e47369f4d680276 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 28 Nov 2024 10:11:08 +0000 Subject: [PATCH 132/209] remote_storage/abs: count 404 and 304 for get as ok for metrics (#9912) ## Problem We currently see elevated levels of errors for GetBlob requests. This is because 404 and 304 are counted as errors for metric reporting. ## Summary of Changes Bring the implementation in line with the S3 client and treat 404 and 304 responses as ok for metric purposes. Related: https://github.com/neondatabase/cloud/issues/20666 --- libs/remote_storage/src/azure_blob.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index ae0a94295c..840917ef68 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -220,6 +220,11 @@ impl AzureBlobStorage { let started_at = ScopeGuard::into_inner(started_at); let outcome = match &download { Ok(_) => AttemptOutcome::Ok, + // At this level in the stack 404 and 304 responses do not indicate an error. + // There's expected cases when a blob may not exist or hasn't been modified since + // the last get (e.g. probing for timeline indices and heatmap downloads). + // Callers should handle errors if they are unexpected. + Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok, Err(_) => AttemptOutcome::Err, }; crate::metrics::BUCKET_METRICS From 2fa461b66887323889a3ee1cf538aa57cb17c49e Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 28 Nov 2024 16:48:18 +0100 Subject: [PATCH 133/209] Makefile: build pg_visibility (#9922) Build the `pg_visibility` extension for use with `neon_local`. This is useful to inspect the visibility map for debugging. Touches #9914. 
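For illustration only (not part of this patch): once the extension is built, the visibility map of a table in a `neon_local` endpoint can be inspected with any Postgres client. A minimal sketch using `tokio-postgres` follows; the connection parameters, role and table name are placeholders, and `psql` with the same two statements works just as well.

```rust
use tokio_postgres::NoTls;

#[tokio::main]
async fn main() -> Result<(), tokio_postgres::Error> {
    // Placeholder connection parameters for a locally running endpoint.
    let (client, connection) = tokio_postgres::connect(
        "host=127.0.0.1 port=55432 user=cloud_admin dbname=postgres",
        NoTls,
    )
    .await?;
    tokio::spawn(connection);

    client
        .simple_query("CREATE EXTENSION IF NOT EXISTS pg_visibility")
        .await?;

    // pg_visibility_map() reports the all-visible / all-frozen VM bits per heap block.
    let rows = client
        .query(
            "SELECT blkno, all_visible, all_frozen FROM pg_visibility_map('my_table'::regclass)",
            &[],
        )
        .await?;
    for row in rows {
        let blkno: i64 = row.get(0);
        let all_visible: bool = row.get(1);
        let all_frozen: bool = row.get(2);
        println!("block {blkno}: all_visible={all_visible} all_frozen={all_frozen}");
    }
    Ok(())
}
```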
--- Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile b/Makefile index dc67b87239..9cffc74508 100644 --- a/Makefile +++ b/Makefile @@ -147,6 +147,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_prewarm install +@echo "Compiling pg_buffercache $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install + +@echo "Compiling pg_visibility $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install +@echo "Compiling pageinspect $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install +@echo "Compiling amcheck $*" From e401f666983e59e4763f97b921cbc1a614af6c44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 28 Nov 2024 16:49:30 +0100 Subject: [PATCH 134/209] Update rust to 1.83.0, also update cargo adjacent tools (#9926) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://releases.rs/docs/1.83.0/). Also update `cargo-hakari`, `cargo-deny`, `cargo-hack` and `cargo-nextest` to their latest versions. Prior update was in #9445. --- build-tools.Dockerfile | 18 +++++++++--------- rust-toolchain.toml | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 4f491afec5..2671702697 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -57,9 +57,9 @@ RUN mkdir -p /pgcopydb/bin && \ mkdir -p /pgcopydb/lib && \ chmod -R 755 /pgcopydb && \ chown -R nonroot:nonroot /pgcopydb - -COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb -COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 + +COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb +COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 # System deps # @@ -258,14 +258,14 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.82.0 +ENV RUSTC_VERSION=1.83.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 -ARG CARGO_HAKARI_VERSION=0.9.30 -ARG CARGO_DENY_VERSION=0.16.1 -ARG CARGO_HACK_VERSION=0.6.31 -ARG CARGO_NEXTEST_VERSION=0.9.72 +ARG CARGO_HAKARI_VERSION=0.9.33 +ARG CARGO_DENY_VERSION=0.16.2 +ARG CARGO_HACK_VERSION=0.6.33 +ARG CARGO_NEXTEST_VERSION=0.9.85 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \ chmod +x rustup-init && \ ./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \ @@ -289,7 +289,7 @@ RUN whoami \ && cargo --version --verbose \ && rustup --version --verbose \ && rustc --version --verbose \ - && clang --version + && clang --version RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \ diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 92b7929c7f..f0661a32e0 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.82.0" +channel = "1.83.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. 
# https://rust-lang.github.io/rustup/concepts/profiles.html From e16439400dc07612883b603c4f51b3357220f6f5 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 28 Nov 2024 17:38:47 +0000 Subject: [PATCH 135/209] pageserver: return correct LSN for interpreted proto keep alive responses (#9928) ## Problem For the interpreted proto the pageserver is not returning the correct LSN in replies to keep alive requests. This is because the interpreted protocol arm was not updating `last_rec_lsn`. ## Summary of changes * Return correct LSN in keep-alive responses * Fix shard field in wal sender traces --- .../tenant/timeline/walreceiver/walreceiver_connection.rs | 4 ++++ safekeeper/src/handler.rs | 5 +++-- safekeeper/src/wal_service.rs | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 31cf1b6307..d90ffbfa2c 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -454,6 +454,10 @@ pub(super) async fn handle_walreceiver_connection( timeline.get_last_record_lsn() ); + if let Some(lsn) = next_record_lsn { + last_rec_lsn = lsn; + } + Some(streaming_lsn) } diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 22f33b17e0..8dd2929a03 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -212,8 +212,9 @@ impl postgres_backend::Handler ); if let Some(shard) = self.shard.as_ref() { - tracing::Span::current() - .record("shard", tracing::field::display(shard.shard_slug())); + if let Some(slug) = shard.shard_slug().strip_prefix("-") { + tracing::Span::current().record("shard", tracing::field::display(slug)); + } } Ok(()) diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 1ab54d4cce..5248d545db 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -44,7 +44,7 @@ pub async fn task_main( error!("connection handler exited: {}", err); } } - .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty)), + .instrument(info_span!("", cid = %conn_id, ttid = field::Empty, application_name = field::Empty, shard = field::Empty)), ); } } From 1fb6ab59e82fd7f603b1a406474cb8092589c659 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 28 Nov 2024 19:02:57 +0000 Subject: [PATCH 136/209] test_runner: rerun all failed tests (#9917) ## Problem Currently, we rerun only known flaky tests. This approach was chosen to reduce the number of tests that go unnoticed (by forcing people to take a look at failed tests and rerun the job manually), but it has some drawbacks: - In PRs, people tend to push new changes without checking failed tests (that's ok) - In the main, tests are just restarted without checking (understandable) - Parametrised tests become flaky one by one, i.e. if `test[1]` is flaky `, test[2]` is not marked as flaky automatically (which may or may not be the case). I suggest rerunning all failed tests to increase the stability of GitHub jobs and using the Grafana Dashboard with flaky tests for deeper analysis. 
## Summary of changes - Rerun all failed tests twice at max --- .../actions/run-python-test-set/action.yml | 17 +- .github/workflows/_build-and-test-locally.yml | 2 +- poetry.lock | 12 +- pyproject.toml | 2 +- scripts/flaky_tests.py | 147 ------------------ test_runner/conftest.py | 2 +- test_runner/fixtures/flaky.py | 78 ---------- test_runner/fixtures/paths.py | 2 +- test_runner/fixtures/reruns.py | 31 ++++ 9 files changed, 46 insertions(+), 247 deletions(-) delete mode 100755 scripts/flaky_tests.py delete mode 100644 test_runner/fixtures/flaky.py create mode 100644 test_runner/fixtures/reruns.py diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index 275f161019..1159627302 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -36,8 +36,8 @@ inputs: description: 'Region name for real s3 tests' required: false default: '' - rerun_flaky: - description: 'Whether to rerun flaky tests' + rerun_failed: + description: 'Whether to rerun failed tests' required: false default: 'false' pg_version: @@ -108,7 +108,7 @@ runs: COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }} ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage') ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage') - RERUN_FLAKY: ${{ inputs.rerun_flaky }} + RERUN_FAILED: ${{ inputs.rerun_failed }} PG_VERSION: ${{ inputs.pg_version }} shell: bash -euxo pipefail {0} run: | @@ -154,15 +154,8 @@ runs: EXTRA_PARAMS="--out-dir $PERF_REPORT_DIR $EXTRA_PARAMS" fi - if [ "${RERUN_FLAKY}" == "true" ]; then - mkdir -p $TEST_OUTPUT - poetry run ./scripts/flaky_tests.py "${TEST_RESULT_CONNSTR}" \ - --days 7 \ - --output "$TEST_OUTPUT/flaky.json" \ - --pg-version "${DEFAULT_PG_VERSION}" \ - --build-type "${BUILD_TYPE}" - - EXTRA_PARAMS="--flaky-tests-json $TEST_OUTPUT/flaky.json $EXTRA_PARAMS" + if [ "${RERUN_FAILED}" == "true" ]; then + EXTRA_PARAMS="--reruns 2 $EXTRA_PARAMS" fi # We use pytest-split plugin to run benchmarks in parallel on different CI runners diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index bdf7c07c6a..42c32a23e3 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -293,7 +293,7 @@ jobs: run_with_real_s3: true real_s3_bucket: neon-github-ci-tests real_s3_region: eu-central-1 - rerun_flaky: true + rerun_failed: true pg_version: ${{ matrix.pg_version }} env: TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} diff --git a/poetry.lock b/poetry.lock index e2fca7be47..59ae5cf1ca 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2563,18 +2563,18 @@ pytest = "*" [[package]] name = "pytest-rerunfailures" -version = "13.0" +version = "15.0" description = "pytest plugin to re-run tests to eliminate flaky failures" optional = false -python-versions = ">=3.7" +python-versions = ">=3.9" files = [ - {file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"}, - {file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"}, + {file = "pytest-rerunfailures-15.0.tar.gz", hash = "sha256:2d9ac7baf59f4c13ac730b47f6fa80e755d1ba0581da45ce30b72fb3542b4474"}, + {file = 
"pytest_rerunfailures-15.0-py3-none-any.whl", hash = "sha256:dd150c4795c229ef44320adc9a0c0532c51b78bb7a6843a8c53556b9a611df1a"}, ] [package.dependencies] packaging = ">=17.1" -pytest = ">=7" +pytest = ">=7.4,<8.2.2 || >8.2.2" [[package]] name = "pytest-split" @@ -3524,4 +3524,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "21debe1116843e5d14bdf37d6e265c68c63a98a64ba04ec8b8a02af2e8d9f486" +content-hash = "426c385df93f578ba3537c40a269535e27fbcca1978b3cf266096ecbc298c6a9" diff --git a/pyproject.toml b/pyproject.toml index ccd3ab1864..01d15ee6bb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ types-psutil = "^5.9.5.12" types-toml = "^0.10.8.6" pytest-httpserver = "^1.0.8" aiohttp = "3.10.11" -pytest-rerunfailures = "^13.0" +pytest-rerunfailures = "^15.0" types-pytest-lazy-fixture = "^0.6.3.3" pytest-split = "^0.8.1" zstandard = "^0.21.0" diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py deleted file mode 100755 index 3fb668ed2d..0000000000 --- a/scripts/flaky_tests.py +++ /dev/null @@ -1,147 +0,0 @@ -#! /usr/bin/env python3 - -from __future__ import annotations - -import argparse -import json -import logging -import os -from collections import defaultdict -from typing import TYPE_CHECKING - -import psycopg2 -import psycopg2.extras -import toml - -if TYPE_CHECKING: - from typing import Any - -FLAKY_TESTS_QUERY = """ - SELECT - DISTINCT parent_suite, suite, name - FROM results - WHERE - started_at > CURRENT_DATE - INTERVAL '%s' day - AND ( - (status IN ('failed', 'broken') AND reference = 'refs/heads/main') - OR flaky - ) - ; -""" - - -def main(args: argparse.Namespace): - connstr = args.connstr - interval_days = args.days - output = args.output - - build_type = args.build_type - pg_version = args.pg_version - - res: defaultdict[str, defaultdict[str, dict[str, bool]]] - res = defaultdict(lambda: defaultdict(dict)) - - try: - logging.info("connecting to the database...") - with psycopg2.connect(connstr, connect_timeout=30) as conn: - with conn.cursor(cursor_factory=psycopg2.extras.DictCursor) as cur: - logging.info("fetching flaky tests...") - cur.execute(FLAKY_TESTS_QUERY, (interval_days,)) - rows = cur.fetchall() - except psycopg2.OperationalError as exc: - logging.error("cannot fetch flaky tests from the DB due to an error", exc) - rows = [] - - # If a test run has non-default PAGESERVER_VIRTUAL_FILE_IO_ENGINE (i.e. 
not empty, not tokio-epoll-uring), - # use it to parametrize test name along with build_type and pg_version - # - # See test_runner/fixtures/parametrize.py for details - if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ( - "", - "tokio-epoll-uring", - ): - pageserver_virtual_file_io_engine_parameter = f"-{io_engine}" - else: - pageserver_virtual_file_io_engine_parameter = "" - - # re-use existing records of flaky tests from before parametrization by compaction_algorithm - def get_pageserver_default_tenant_config_compaction_algorithm() -> dict[str, Any] | None: - """Duplicated from parametrize.py""" - toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") - if toml_table is None: - return None - v = toml.loads(toml_table) - assert isinstance(v, dict) - return v - - pageserver_default_tenant_config_compaction_algorithm_parameter = "" - if ( - explicit_default := get_pageserver_default_tenant_config_compaction_algorithm() - ) is not None: - pageserver_default_tenant_config_compaction_algorithm_parameter = ( - f"-{explicit_default['kind']}" - ) - - for row in rows: - # We don't want to automatically rerun tests in a performance suite - if row["parent_suite"] != "test_runner.regress": - continue - - if row["name"].endswith("]"): - parametrized_test = row["name"].replace( - "[", - f"[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}-", - ) - else: - parametrized_test = f"{row['name']}[{build_type}-pg{pg_version}{pageserver_virtual_file_io_engine_parameter}{pageserver_default_tenant_config_compaction_algorithm_parameter}]" - - res[row["parent_suite"]][row["suite"]][parametrized_test] = True - - logging.info( - f"\t{row['parent_suite'].replace('.', '/')}/{row['suite']}.py::{parametrized_test}" - ) - - logging.info(f"saving results to {output.name}") - json.dump(res, output, indent=2) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Detect flaky tests in the last N days") - parser.add_argument( - "--output", - type=argparse.FileType("w"), - default="flaky.json", - help="path to output json file (default: flaky.json)", - ) - parser.add_argument( - "--days", - required=False, - default=10, - type=int, - help="how many days to look back for flaky tests (default: 10)", - ) - parser.add_argument( - "--build-type", - required=True, - type=str, - help="for which build type to create list of flaky tests (debug or release)", - ) - parser.add_argument( - "--pg-version", - required=True, - type=int, - help="for which Postgres version to create list of flaky tests (14, 15, etc.)", - ) - parser.add_argument( - "connstr", - help="connection string to the test results database", - ) - args = parser.parse_args() - - level = logging.INFO - logging.basicConfig( - format="%(message)s", - level=level, - ) - - main(args) diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 84eda52d33..887bfef478 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -13,5 +13,5 @@ pytest_plugins = ( "fixtures.pg_stats", "fixtures.compare_fixtures", "fixtures.slow", - "fixtures.flaky", + "fixtures.reruns", ) diff --git a/test_runner/fixtures/flaky.py b/test_runner/fixtures/flaky.py deleted file mode 100644 index 01634a29c5..0000000000 --- a/test_runner/fixtures/flaky.py +++ /dev/null @@ -1,78 +0,0 @@ -from __future__ import annotations - -import json -from collections.abc import MutableMapping -from pathlib import Path -from typing import 
TYPE_CHECKING, cast - -import pytest -from _pytest.config import Config -from _pytest.config.argparsing import Parser -from allure_commons.types import LabelType -from allure_pytest.utils import allure_name, allure_suite_labels - -from fixtures.log_helper import log - -if TYPE_CHECKING: - from collections.abc import MutableMapping - from typing import Any - - -""" -The plugin reruns flaky tests. -It uses `pytest.mark.flaky` provided by `pytest-rerunfailures` plugin and flaky tests detected by `scripts/flaky_tests.py` - -Note: the logic of getting flaky tests is extracted to a separate script to avoid running it for each of N xdist workers -""" - - -def pytest_addoption(parser: Parser): - parser.addoption( - "--flaky-tests-json", - action="store", - type=Path, - help="Path to json file with flaky tests generated by scripts/flaky_tests.py", - ) - - -def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]): - if not config.getoption("--flaky-tests-json"): - return - - # Any error with getting flaky tests aren't critical, so just do not rerun any tests - flaky_json = config.getoption("--flaky-tests-json") - if not flaky_json.exists(): - return - - content = flaky_json.read_text() - try: - flaky_tests = json.loads(content) - except ValueError: - log.error(f"Can't parse {content} as json") - return - - for item in items: - # Use the same logic for constructing test name as Allure does (we store allure-provided data in DB) - # Ref https://github.com/allure-framework/allure-python/blob/2.13.1/allure-pytest/src/listener.py#L98-L100 - allure_labels = dict(allure_suite_labels(item)) - parent_suite = str(allure_labels.get(LabelType.PARENT_SUITE)) - suite = str(allure_labels.get(LabelType.SUITE)) - params = item.callspec.params if hasattr(item, "callspec") else {} - name = allure_name(item, params) - - if flaky_tests.get(parent_suite, {}).get(suite, {}).get(name, False): - # Rerun 3 times = 1 original run + 2 reruns - log.info(f"Marking {item.nodeid} as flaky. It will be rerun up to 3 times") - item.add_marker(pytest.mark.flaky(reruns=2)) - - # pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns), - # we can workaround it by setting `timeout_func_only` to True[1]. - # Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2], - # but we still can do it using pytest marker. - # - # - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99 - # - [2] https://github.com/pytest-dev/pytest-timeout/issues/142 - timeout_marker = item.get_closest_marker("timeout") - if timeout_marker is not None: - kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs) - kwargs["func_only"] = True diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 1c71abea19..80777d65e9 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -30,7 +30,7 @@ def get_test_dir(request: FixtureRequest, top_output_dir: Path, prefix: str | No test_name = request.node.name test_dir = top_output_dir / f"{prefix or ''}{test_name.replace('/', '-')}" - # We rerun flaky tests multiple times, use a separate directory for each run. + # We rerun failed tests multiple times, use a separate directory for each run. 
if (suffix := getattr(request.node, "execution_count", None)) is not None: test_dir = test_dir.parent / f"{test_dir.name}-{suffix}" diff --git a/test_runner/fixtures/reruns.py b/test_runner/fixtures/reruns.py new file mode 100644 index 0000000000..f2a25ae8f6 --- /dev/null +++ b/test_runner/fixtures/reruns.py @@ -0,0 +1,31 @@ +from __future__ import annotations + +from collections.abc import MutableMapping +from typing import TYPE_CHECKING, cast + +import pytest + +if TYPE_CHECKING: + from collections.abc import MutableMapping + from typing import Any + + from _pytest.config import Config + + +def pytest_collection_modifyitems(config: Config, items: list[pytest.Item]): + # pytest-rerunfailures is not compatible with pytest-timeout (timeout is not set for reruns), + # we can workaround it by setting `timeout_func_only` to True[1]. + # Unfortunately, setting `timeout_func_only = True` globally in pytest.ini is broken[2], + # but we still can do it using pytest marker. + # + # - [1] https://github.com/pytest-dev/pytest-rerunfailures/issues/99 + # - [2] https://github.com/pytest-dev/pytest-timeout/issues/142 + + if not config.getoption("--reruns"): + return + + for item in items: + timeout_marker = item.get_closest_marker("timeout") + if timeout_marker is not None: + kwargs = cast("MutableMapping[str, Any]", timeout_marker.kwargs) + kwargs["func_only"] = True From b0822a54991825e450bcbf450599707d476ddc4b Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 28 Nov 2024 22:38:30 +0100 Subject: [PATCH 137/209] fix(compute_ctl): Allow usage of DB names with whitespaces (#9919) ## Problem We used `set_path()` to replace the database name in the connection string. It automatically does url-safe encoding if the path is not already encoded, but it does it as per the URL standard, which assumes that tabs can be safely removed from the path without changing the meaning of the URL. See, e.g., https://url.spec.whatwg.org/#concept-basic-url-parser. It also breaks for DBs with properly %-encoded names, like with `%20`, as they are kept intact, but actually should be escaped. Yet, this is not true for Postgres, where it's completely valid to have trailing tabs in the database name. I think this is the PR that caused this regression https://github.com/neondatabase/neon/pull/9717, as it switched from `postgres::config::Config` back to `set_path()`. This was fixed a while ago already [1], btw, I just haven't added a test to catch this regression back then :( ## Summary of changes This commit changes the code back to use `postgres/tokio_postgres::Config` everywhere. While on it, also do some changes around, as I had to touch this code: 1. Bump some logging from `debug` to `info` in the spec apply path. We do not use `debug` in prod, and it was tricky to understand what was going on with this bug in prod. 2. Refactor configuration concurrency calculation code so it was reusable. Yet, still keep `1` in the case of reconfiguration. The database can be actively used at this moment, so we cannot guarantee that there will be enough spare connection slots, and the underlying code won't handle connection errors properly. 3. Simplify the installed extensions code. It was spawning a blocking task inside async function, which doesn't make much sense. Instead, just have a main sync function and call it with `spawn_blocking` in the API code -- the only place we need it to be async. 4. Add regression python test to cover this and related problems in the future. 
Also, add more extensive testing of the schema dump and the DBs and roles listing API.

[1]: https://github.com/neondatabase/neon/commit/4d1e48f3b9a4b7064787513fd2c455f0001f6e18
[2]: https://www.postgresql.org/message-id/flat/20151023003445.931.91267%40wrigleys.postgresql.org

Resolves neondatabase/cloud#20869
---
 compute_tools/src/catalog.rs                |  39 ++++-
 compute_tools/src/compute.rs                | 153 +++++++++++---------
 compute_tools/src/http/api.rs               |   7 +-
 compute_tools/src/installed_extensions.rs   | 105 +++++---------
 compute_tools/src/pg_helpers.rs             |  11 ++
 test_runner/fixtures/endpoint/http.py       |   6 +-
 test_runner/fixtures/neon_fixtures.py       |  29 ++++
 test_runner/regress/test_compute_catalog.py | 111 +++++++++++++-
 8 files changed, 318 insertions(+), 143 deletions(-)

diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
index 2f6f82dd39..08ae8bf44d 100644
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -1,4 +1,3 @@
-use compute_api::responses::CatalogObjects;
 use futures::Stream;
 use postgres::NoTls;
 use std::{path::Path, process::Stdio, result::Result, sync::Arc};
@@ -13,7 +12,8 @@ use tokio_util::codec::{BytesCodec, FramedRead};
 use tracing::warn;

 use crate::compute::ComputeNode;
-use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async};
+use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgres_conf_for_db};
+use compute_api::responses::CatalogObjects;

 pub async fn get_dbs_and_roles(compute: &Arc<ComputeNode>) -> anyhow::Result<CatalogObjects> {
     let connstr = compute.connstr.clone();
@@ -43,6 +43,8 @@ pub enum SchemaDumpError {
     DatabaseDoesNotExist,
     #[error("Failed to execute pg_dump.")]
     IO(#[from] std::io::Error),
+    #[error("Unexpected error.")]
+    Unexpected,
 }

 // It uses the pg_dump utility to dump the schema of the specified database.
@@ -60,11 +62,38 @@ pub async fn get_database_schema(
     let pgbin = &compute.pgbin;
     let basepath = Path::new(pgbin).parent().unwrap();
     let pgdump = basepath.join("pg_dump");
-    let mut connstr = compute.connstr.clone();
-    connstr.set_path(dbname);
+
+    // Replace the DB in the connection string and split it into its parts.
+    // This is the only option to handle DBs with special characters.
+ let conf = + postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?; + let host = conf + .get_hosts() + .first() + .ok_or(SchemaDumpError::Unexpected)?; + let host = match host { + tokio_postgres::config::Host::Tcp(ip) => ip.to_string(), + #[cfg(unix)] + tokio_postgres::config::Host::Unix(path) => path.to_string_lossy().to_string(), + }; + let port = conf + .get_ports() + .first() + .ok_or(SchemaDumpError::Unexpected)?; + let user = conf.get_user().ok_or(SchemaDumpError::Unexpected)?; + let dbname = conf.get_dbname().ok_or(SchemaDumpError::Unexpected)?; + let mut cmd = Command::new(pgdump) + // XXX: this seems to be the only option to deal with DBs with `=` in the name + // See + .env("PGDATABASE", dbname) + .arg("--host") + .arg(host) + .arg("--port") + .arg(port.to_string()) + .arg("--username") + .arg(user) .arg("--schema-only") - .arg(connstr.as_str()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .kill_on_drop(true) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 4f67425ba8..1a026a4014 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -34,9 +34,8 @@ use utils::measured_stream::MeasuredReader; use nix::sys::signal::{kill, Signal}; use remote_storage::{DownloadError, RemotePath}; use tokio::spawn; -use url::Url; -use crate::installed_extensions::get_installed_extensions_sync; +use crate::installed_extensions::get_installed_extensions; use crate::local_proxy; use crate::pg_helpers::*; use crate::spec::*; @@ -816,30 +815,32 @@ impl ComputeNode { Ok(()) } - async fn get_maintenance_client(url: &Url) -> Result { - let mut connstr = url.clone(); + async fn get_maintenance_client( + conf: &tokio_postgres::Config, + ) -> Result { + let mut conf = conf.clone(); - connstr - .query_pairs_mut() - .append_pair("application_name", "apply_config"); + conf.application_name("apply_config"); - let (client, conn) = match tokio_postgres::connect(connstr.as_str(), NoTls).await { + let (client, conn) = match conf.connect(NoTls).await { + // If connection fails, it may be the old node with `zenith_admin` superuser. + // + // In this case we need to connect with old `zenith_admin` name + // and create new user. We cannot simply rename connected user, + // but we can create a new one and grant it all privileges. Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) | Some(&SqlState::INVALID_AUTHORIZATION_SPECIFICATION) => { - // connect with zenith_admin if cloud_admin could not authenticate + // Connect with zenith_admin if cloud_admin could not authenticate info!( "cannot connect to postgres: {}, retrying with `zenith_admin` username", e ); - let mut zenith_admin_connstr = connstr.clone(); - - zenith_admin_connstr - .set_username("zenith_admin") - .map_err(|_| anyhow::anyhow!("invalid connstr"))?; + let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); + zenith_admin_conf.user("zenith_admin"); let mut client = - Client::connect(zenith_admin_connstr.as_str(), NoTls) + zenith_admin_conf.connect(NoTls) .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?; // Disable forwarding so that users don't get a cloud_admin role @@ -853,8 +854,8 @@ impl ComputeNode { drop(client); - // reconnect with connstring with expected name - tokio_postgres::connect(connstr.as_str(), NoTls).await? + // Reconnect with connstring with expected name + conf.connect(NoTls).await? 
} _ => return Err(e.into()), }, @@ -885,7 +886,7 @@ impl ComputeNode { pub fn apply_spec_sql( &self, spec: Arc, - url: Arc, + conf: Arc, concurrency: usize, ) -> Result<()> { let rt = tokio::runtime::Builder::new_multi_thread() @@ -897,7 +898,7 @@ impl ComputeNode { rt.block_on(async { // Proceed with post-startup configuration. Note, that order of operations is important. - let client = Self::get_maintenance_client(&url).await?; + let client = Self::get_maintenance_client(&conf).await?; let spec = spec.clone(); let databases = get_existing_dbs_async(&client).await?; @@ -931,7 +932,7 @@ impl ComputeNode { RenameAndDeleteDatabases, CreateAndAlterDatabases, ] { - debug!("Applying phase {:?}", &phase); + info!("Applying phase {:?}", &phase); apply_operations( spec.clone(), ctx.clone(), @@ -942,6 +943,7 @@ impl ComputeNode { .await?; } + info!("Applying RunInEachDatabase phase"); let concurrency_token = Arc::new(tokio::sync::Semaphore::new(concurrency)); let db_processes = spec @@ -955,7 +957,7 @@ impl ComputeNode { let spec = spec.clone(); let ctx = ctx.clone(); let jwks_roles = jwks_roles.clone(); - let mut url = url.as_ref().clone(); + let mut conf = conf.as_ref().clone(); let concurrency_token = concurrency_token.clone(); let db = db.clone(); @@ -964,14 +966,14 @@ impl ComputeNode { match &db { DB::SystemDB => {} DB::UserDB(db) => { - url.set_path(db.name.as_str()); + conf.dbname(db.name.as_str()); } } - let url = Arc::new(url); + let conf = Arc::new(conf); let fut = Self::apply_spec_sql_db( spec.clone(), - url, + conf, ctx.clone(), jwks_roles.clone(), concurrency_token.clone(), @@ -1017,7 +1019,7 @@ impl ComputeNode { /// semaphore. The caller has to make sure the semaphore isn't exhausted. async fn apply_spec_sql_db( spec: Arc, - url: Arc, + conf: Arc, ctx: Arc>, jwks_roles: Arc>, concurrency_token: Arc, @@ -1046,7 +1048,7 @@ impl ComputeNode { // that database. || async { if client_conn.is_none() { - let db_client = Self::get_maintenance_client(&url).await?; + let db_client = Self::get_maintenance_client(&conf).await?; client_conn.replace(db_client); } let client = client_conn.as_ref().unwrap(); @@ -1061,34 +1063,16 @@ impl ComputeNode { Ok::<(), anyhow::Error>(()) } - /// Do initial configuration of the already started Postgres. - #[instrument(skip_all)] - pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { - // If connection fails, - // it may be the old node with `zenith_admin` superuser. - // - // In this case we need to connect with old `zenith_admin` name - // and create new user. We cannot simply rename connected user, - // but we can create a new one and grant it all privileges. - let mut url = self.connstr.clone(); - url.query_pairs_mut() - .append_pair("application_name", "apply_config"); - - let url = Arc::new(url); - let spec = Arc::new( - compute_state - .pspec - .as_ref() - .expect("spec must be set") - .spec - .clone(), - ); - - // Choose how many concurrent connections to use for applying the spec changes. - // If the cluster is not currently Running we don't have to deal with user connections, + /// Choose how many concurrent connections to use for applying the spec changes. + pub fn max_service_connections( + &self, + compute_state: &ComputeState, + spec: &ComputeSpec, + ) -> usize { + // If the cluster is in Init state we don't have to deal with user connections, // and can thus use all `max_connections` connection slots. However, that's generally not // very efficient, so we generally still limit it to a smaller number. 
- let max_concurrent_connections = if compute_state.status != ComputeStatus::Running { + if compute_state.status == ComputeStatus::Init { // If the settings contain 'max_connections', use that as template if let Some(config) = spec.cluster.settings.find("max_connections") { config.parse::().ok() @@ -1144,10 +1128,29 @@ impl ComputeNode { .map(|val| if val > 1 { val - 1 } else { 1 }) .last() .unwrap_or(3) - }; + } + } + + /// Do initial configuration of the already started Postgres. + #[instrument(skip_all)] + pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { + let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + conf.application_name("apply_config"); + + let conf = Arc::new(conf); + let spec = Arc::new( + compute_state + .pspec + .as_ref() + .expect("spec must be set") + .spec + .clone(), + ); + + let max_concurrent_connections = self.max_service_connections(compute_state, &spec); // Merge-apply spec & changes to PostgreSQL state. - self.apply_spec_sql(spec.clone(), url.clone(), max_concurrent_connections)?; + self.apply_spec_sql(spec.clone(), conf.clone(), max_concurrent_connections)?; if let Some(ref local_proxy) = &spec.clone().local_proxy_config { info!("configuring local_proxy"); @@ -1156,12 +1159,11 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { - let mut connstr = url.as_ref().clone(); - connstr - .query_pairs_mut() - .append_pair("application_name", "migrations"); + let conf = conf.as_ref().clone(); + let mut conf = postgres::config::Config::from(conf); + conf.application_name("migrations"); - let mut client = Client::connect(connstr.as_str(), NoTls)?; + let mut client = conf.connect(NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") }); @@ -1222,21 +1224,28 @@ impl ComputeNode { let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); config::write_postgres_conf(&postgresql_conf_path, &spec, None)?; - // temporarily reset max_cluster_size in config + + // TODO(ololobus): We need a concurrency during reconfiguration as well, + // but DB is already running and used by user. We can easily get out of + // `max_connections` limit, and the current code won't handle that. + // let compute_state = self.state.lock().unwrap().clone(); + // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec); + let max_concurrent_connections = 1; + + // Temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are reconfiguring: - // creating new extensions, roles, etc... + // creating new extensions, roles, etc. 
config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || { self.pg_reload_conf()?; if spec.mode == ComputeMode::Primary { - let mut url = self.connstr.clone(); - url.query_pairs_mut() - .append_pair("application_name", "apply_config"); - let url = Arc::new(url); + let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); + conf.application_name("apply_config"); + let conf = Arc::new(conf); let spec = Arc::new(spec.clone()); - self.apply_spec_sql(spec, url, 1)?; + self.apply_spec_sql(spec, conf, max_concurrent_connections)?; } Ok(()) @@ -1362,7 +1371,17 @@ impl ComputeNode { let connstr = self.connstr.clone(); thread::spawn(move || { - get_installed_extensions_sync(connstr).context("get_installed_extensions") + let res = get_installed_extensions(&connstr); + match res { + Ok(extensions) => { + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&extensions) + .expect("failed to serialize extensions list") + ); + } + Err(err) => error!("could not get installed extensions: {err:?}"), + } }); } diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 8a047634df..a6c6cff20a 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -296,7 +296,12 @@ async fn routes(req: Request, compute: &Arc) -> Response render_json(Body::from(serde_json::to_string(&res).unwrap())), Err(e) => render_json_error( diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 79d8b2ca04..f473c29a55 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -2,17 +2,16 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; use metrics::proto::MetricFamily; use std::collections::HashMap; use std::collections::HashSet; -use tracing::info; -use url::Url; use anyhow::Result; use postgres::{Client, NoTls}; -use tokio::task; use metrics::core::Collector; use metrics::{register_uint_gauge_vec, UIntGaugeVec}; use once_cell::sync::Lazy; +use crate::pg_helpers::postgres_conf_for_db; + /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. /// @@ -42,75 +41,51 @@ fn list_dbs(client: &mut Client) -> Result> { /// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. -pub async fn get_installed_extensions(connstr: Url) -> Result { - let mut connstr = connstr.clone(); +pub fn get_installed_extensions(connstr: &url::Url) -> Result { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + let databases: Vec = list_dbs(&mut client)?; - task::spawn_blocking(move || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; - let databases: Vec = list_dbs(&mut client)?; + let mut extensions_map: HashMap = HashMap::new(); + for db in databases.iter() { + let config = postgres_conf_for_db(connstr, db)?; + let mut db_client = config.connect(NoTls)?; + let extensions: Vec<(String, String)> = db_client + .query( + "SELECT extname, extversion FROM pg_catalog.pg_extension;", + &[], + )? 
+ .iter() + .map(|row| (row.get("extname"), row.get("extversion"))) + .collect(); - let mut extensions_map: HashMap = HashMap::new(); - for db in databases.iter() { - connstr.set_path(db); - let mut db_client = Client::connect(connstr.as_str(), NoTls)?; - let extensions: Vec<(String, String)> = db_client - .query( - "SELECT extname, extversion FROM pg_catalog.pg_extension;", - &[], - )? - .iter() - .map(|row| (row.get("extname"), row.get("extversion"))) - .collect(); + for (extname, v) in extensions.iter() { + let version = v.to_string(); - for (extname, v) in extensions.iter() { - let version = v.to_string(); + // increment the number of databases where the version of extension is installed + INSTALLED_EXTENSIONS + .with_label_values(&[extname, &version]) + .inc(); - // increment the number of databases where the version of extension is installed - INSTALLED_EXTENSIONS - .with_label_values(&[extname, &version]) - .inc(); - - extensions_map - .entry(extname.to_string()) - .and_modify(|e| { - e.versions.insert(version.clone()); - // count the number of databases where the extension is installed - e.n_databases += 1; - }) - .or_insert(InstalledExtension { - extname: extname.to_string(), - versions: HashSet::from([version.clone()]), - n_databases: 1, - }); - } + extensions_map + .entry(extname.to_string()) + .and_modify(|e| { + e.versions.insert(version.clone()); + // count the number of databases where the extension is installed + e.n_databases += 1; + }) + .or_insert(InstalledExtension { + extname: extname.to_string(), + versions: HashSet::from([version.clone()]), + n_databases: 1, + }); } + } - let res = InstalledExtensions { - extensions: extensions_map.values().cloned().collect(), - }; + let res = InstalledExtensions { + extensions: extensions_map.values().cloned().collect(), + }; - Ok(res) - }) - .await? -} - -// Gather info about installed extensions -pub fn get_installed_extensions_sync(connstr: Url) -> Result<()> { - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create runtime"); - let result = rt - .block_on(crate::installed_extensions::get_installed_extensions( - connstr, - )) - .expect("failed to get installed extensions"); - - info!( - "[NEON_EXT_STAT] {}", - serde_json::to_string(&result).expect("failed to serialize extensions list") - ); - Ok(()) + Ok(res) } static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index 4a1e5ee0e8..e03b410699 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -6,6 +6,7 @@ use std::io::{BufRead, BufReader}; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::Child; +use std::str::FromStr; use std::thread::JoinHandle; use std::time::{Duration, Instant}; @@ -13,8 +14,10 @@ use anyhow::{bail, Result}; use futures::StreamExt; use ini::Ini; use notify::{RecursiveMode, Watcher}; +use postgres::config::Config; use tokio::io::AsyncBufReadExt; use tokio::time::timeout; +use tokio_postgres; use tokio_postgres::NoTls; use tracing::{debug, error, info, instrument}; @@ -542,3 +545,11 @@ async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Resu Ok(()) } + +/// `Postgres::config::Config` handles database names with whitespaces +/// and special characters properly. 
+pub fn postgres_conf_for_db(connstr: &url::Url, dbname: &str) -> Result { + let mut conf = Config::from_str(connstr.as_str())?; + conf.dbname(dbname); + Ok(conf) +} diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index db3723b7cc..1cd9158c68 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -1,5 +1,7 @@ from __future__ import annotations +import urllib.parse + import requests from requests.adapters import HTTPAdapter @@ -20,7 +22,9 @@ class EndpointHttpClient(requests.Session): return res.json() def database_schema(self, database: str): - res = self.get(f"http://localhost:{self.port}/database_schema?database={database}") + res = self.get( + f"http://localhost:{self.port}/database_schema?database={urllib.parse.quote(database, safe='')}" + ) res.raise_for_status() return res.text diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a45a311dc2..1f4d2aa5ec 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3934,6 +3934,35 @@ class Endpoint(PgProtocol, LogUtils): log.info(json.dumps(dict(data_dict, **kwargs))) json.dump(dict(data_dict, **kwargs), file, indent=4) + def respec_deep(self, **kwargs: Any) -> None: + """ + Update the endpoint.json file taking into account nested keys. + It does one level deep update. Should enough for most cases. + Distinct method from respec() to do not break existing functionality. + NOTE: This method also updates the spec.json file, not endpoint.json. + We need it because neon_local also writes to spec.json, so intended + use-case is i) start endpoint with some config, ii) respec_deep(), + iii) call reconfigure() to apply the changes. + """ + config_path = os.path.join(self.endpoint_path(), "spec.json") + with open(config_path) as f: + data_dict: dict[str, Any] = json.load(f) + + log.info("Current compute spec: %s", json.dumps(data_dict, indent=4)) + + for key, value in kwargs.items(): + if isinstance(value, dict): + if key not in data_dict: + data_dict[key] = value + else: + data_dict[key] = {**data_dict[key], **value} + else: + data_dict[key] = value + + with open(config_path, "w") as file: + log.info("Updating compute spec to: %s", json.dumps(data_dict, indent=4)) + json.dump(data_dict, file, indent=4) + # Please note: Migrations only run if pg_skip_catalog_updates is false def wait_for_migrations(self, num_migrations: int = 11): with self.cursor() as cur: diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py index d43c71ceac..b3719a45ed 100644 --- a/test_runner/regress/test_compute_catalog.py +++ b/test_runner/regress/test_compute_catalog.py @@ -3,13 +3,60 @@ from __future__ import annotations import requests from fixtures.neon_fixtures import NeonEnv +TEST_DB_NAMES = [ + { + "name": "neondb", + "owner": "cloud_admin", + }, + { + "name": "db with spaces", + "owner": "cloud_admin", + }, + { + "name": "db with%20spaces ", + "owner": "cloud_admin", + }, + { + "name": "db with whitespaces ", + "owner": "cloud_admin", + }, + { + "name": "injective db with spaces'; SELECT pg_sleep(10);", + "owner": "cloud_admin", + }, + { + "name": "db with #pound-sign and &ersands=true", + "owner": "cloud_admin", + }, + { + "name": "db with emoji 🌍", + "owner": "cloud_admin", + }, +] + def test_compute_catalog(neon_simple_env: NeonEnv): + """ + Create a bunch of databases with tricky names and test that we can list them + and dump via API. 
+ """ env = neon_simple_env - endpoint = env.endpoints.create_start("main", config_lines=["log_min_messages=debug1"]) - client = endpoint.http_client() + endpoint = env.endpoints.create_start("main") + # Update the spec.json file to include new databases + # and reconfigure the endpoint to create some test databases. + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + client = endpoint.http_client() objects = client.dbs_and_roles() # Assert that 'cloud_admin' role exists in the 'roles' list @@ -22,9 +69,24 @@ def test_compute_catalog(neon_simple_env: NeonEnv): db["name"] == "postgres" for db in objects["databases"] ), "The 'postgres' database is missing" - ddl = client.database_schema(database="postgres") + # Check other databases + for test_db in TEST_DB_NAMES: + db = next((db for db in objects["databases"] if db["name"] == test_db["name"]), None) + assert db is not None, f"The '{test_db['name']}' database is missing" + assert ( + db["owner"] == test_db["owner"] + ), f"The '{test_db['name']}' database has incorrect owner" - assert "-- PostgreSQL database dump" in ddl + ddl = client.database_schema(database=test_db["name"]) + + # Check that it looks like a valid PostgreSQL dump + assert "-- PostgreSQL database dump" in ddl + + # Check that it doesn't contain health_check and migration traces. + # They are only created in system `postgres` database, so by checking + # that we ensure that we dump right databases. + assert "health_check" not in ddl, f"The '{test_db['name']}' database contains health_check" + assert "migration" not in ddl, f"The '{test_db['name']}' database contains migrations data" try: client.database_schema(database="nonexistentdb") @@ -33,3 +95,44 @@ def test_compute_catalog(neon_simple_env: NeonEnv): assert ( e.response.status_code == 404 ), f"Expected 404 status code, but got {e.response.status_code}" + + +def test_compute_create_databases(neon_simple_env: NeonEnv): + """ + Test that compute_ctl can create and work with databases with special + characters (whitespaces, %, tabs, etc.) in the name. + """ + env = neon_simple_env + + # Create and start endpoint so that neon_local put all the generated + # stuff into the spec.json file. + endpoint = env.endpoints.create_start("main") + + # Update the spec.json file to include new databases + # and reconfigure the endpoint to apply the changes. + endpoint.respec_deep( + **{ + "skip_pg_catalog_updates": False, + "cluster": { + "databases": TEST_DB_NAMES, + }, + } + ) + endpoint.reconfigure() + + for db in TEST_DB_NAMES: + # Check that database has a correct name in the system catalog + with endpoint.cursor() as cursor: + cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (db["name"],)) + catalog_db = cursor.fetchone() + assert catalog_db is not None + assert len(catalog_db) == 1 + assert catalog_db[0] == db["name"] + + # Check that we can connect to this database without any issues + with endpoint.cursor(dbname=db["name"]) as cursor: + cursor.execute("SELECT * FROM current_database()") + curr_db = cursor.fetchone() + assert curr_db is not None + assert len(curr_db) == 1 + assert curr_db[0] == db["name"] From e5152551ad6de0ae8516d2416cd90244d97b3088 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 29 Nov 2024 10:40:08 +0100 Subject: [PATCH 138/209] test_runner/performance: add logical message ingest benchmark (#9749) Adds a benchmark for logical message WAL ingestion throughput end-to-end. 
Logical messages are essentially noops, and thus ignored by the Pageserver. Example results from my MacBook, with fsync enabled: ``` postgres_ingest: 14.445 s safekeeper_ingest: 29.948 s pageserver_ingest: 30.013 s pageserver_recover_ingest: 8.633 s wal_written: 10,340 MB message_count: 1310720 messages postgres_throughput: 715 MB/s safekeeper_throughput: 345 MB/s pageserver_throughput: 344 MB/s pageserver_recover_throughput: 1197 MB/s ``` See https://github.com/neondatabase/neon/issues/9642#issuecomment-2475995205 for running analysis. Touches #9642. --- test_runner/fixtures/neon_fixtures.py | 31 ++++++ .../test_ingest_logical_message.py | 101 ++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 test_runner/performance/test_ingest_logical_message.py diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1f4d2aa5ec..e3c88e9965 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4404,6 +4404,10 @@ class Safekeeper(LogUtils): log.info(f"sk {self.id} flush LSN: {flush_lsn}") return flush_lsn + def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: + timeline_status = self.http_client().timeline_status(tenant_id, timeline_id) + return timeline_status.commit_lsn + def pull_timeline( self, srcs: list[Safekeeper], tenant_id: TenantId, timeline_id: TimelineId ) -> dict[str, Any]: @@ -4949,6 +4953,33 @@ def wait_for_last_flush_lsn( return min(results) +def wait_for_commit_lsn( + env: NeonEnv, + tenant: TenantId, + timeline: TimelineId, + lsn: Lsn, +) -> Lsn: + # TODO: it would be better to poll this in the compute, but there's no API for it. See: + # https://github.com/neondatabase/neon/issues/9758 + "Wait for the given LSN to be committed on any Safekeeper" + + max_commit_lsn = Lsn(0) + for i in range(1000): + for sk in env.safekeepers: + commit_lsn = sk.get_commit_lsn(tenant, timeline) + if commit_lsn >= lsn: + log.info(f"{tenant}/{timeline} at commit_lsn {commit_lsn}") + return commit_lsn + max_commit_lsn = max(max_commit_lsn, commit_lsn) + + if i % 10 == 0: + log.info( + f"{tenant}/{timeline} waiting for commit_lsn to reach {lsn}, now {max_commit_lsn}" + ) + time.sleep(0.1) + raise Exception(f"timed out while waiting for commit_lsn to reach {lsn}, was {max_commit_lsn}") + + def flush_ep_to_pageserver( env: NeonEnv, ep: Endpoint, diff --git a/test_runner/performance/test_ingest_logical_message.py b/test_runner/performance/test_ingest_logical_message.py new file mode 100644 index 0000000000..d3118eb15a --- /dev/null +++ b/test_runner/performance/test_ingest_logical_message.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + wait_for_commit_lsn, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import wait_for_last_record_lsn + + +@pytest.mark.timeout(600) +@pytest.mark.parametrize("size", [1024, 8192, 131072]) +@pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"]) +def test_ingest_logical_message( + request: pytest.FixtureRequest, + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + fsync: bool, + size: int, +): + """ + Benchmarks ingestion of 10 GB of logical message WAL. These are essentially noops, and don't + incur any pageserver writes. 
+ """ + + VOLUME = 10 * 1024**3 + count = VOLUME // size + + neon_env_builder.safekeepers_enable_fsync = fsync + + env = neon_env_builder.init_start() + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + f"fsync = {fsync}", + # Disable backpressure. We don't want to block on pageserver. + "max_replication_apply_lag = 0", + "max_replication_flush_lag = 0", + "max_replication_write_lag = 0", + ], + ) + client = env.pageserver.http_client() + + # Wait for the timeline to be propagated to the pageserver. + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + + # Ingest data and measure durations. + start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + with endpoint.cursor() as cur: + cur.execute("set statement_timeout = 0") + + # Postgres will return once the logical messages have been written to its local WAL, without + # waiting for Safekeeper commit. We measure ingestion time both for Postgres, Safekeeper, + # and Pageserver to detect bottlenecks. + log.info("Ingesting data") + with zenbenchmark.record_duration("pageserver_ingest"): + with zenbenchmark.record_duration("safekeeper_ingest"): + with zenbenchmark.record_duration("postgres_ingest"): + cur.execute(f""" + select pg_logical_emit_message(false, '', repeat('x', {size})) + from generate_series(1, {count}) + """) + + end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Wait for Safekeeper. + log.info("Waiting for Safekeeper to catch up") + wait_for_commit_lsn(env, env.initial_tenant, env.initial_timeline, end_lsn) + + # Wait for Pageserver. + log.info("Waiting for Pageserver to catch up") + wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + + # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will + # reingest all the WAL from the safekeeper without any other constraints. This gives us a + # baseline of how fast the pageserver can ingest this WAL in isolation. + status = env.storage_controller.inspect(tenant_shard_id=env.initial_tenant) + assert status is not None + + client.tenant_delete(env.initial_tenant) + env.pageserver.tenant_create(tenant_id=env.initial_tenant, generation=status[0]) + + with zenbenchmark.record_duration("pageserver_recover_ingest"): + log.info("Recovering WAL into pageserver") + client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline) + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + + # Emit metrics. + wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) + zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) + zenbenchmark.record("message_count", count, "messages", MetricReport.TEST_PARAM) + + props = {p["name"]: p["value"] for _, p in request.node.user_properties} + for name in ("postgres", "safekeeper", "pageserver", "pageserver_recover"): + throughput = int(wal_written_mb / props[f"{name}_ingest"]) + zenbenchmark.record(f"{name}_throughput", throughput, "MB/s", MetricReport.HIGHER_IS_BETTER) From e61ec94fbc25da4368cd2c6242f66067f5cd85fe Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 29 Nov 2024 11:08:01 +0000 Subject: [PATCH 139/209] chore(proxy): vendor a subset of rust-postgres (#9930) Our rust-postgres fork is getting messy. Mostly because proxy wants more control over the raw protocol than tokio-postgres provides. As such, it's diverging more and more. 
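To make "more control over the raw protocol" concrete: proxy drives pieces like the SCRAM exchange itself instead of leaving them hidden inside the client library. Below is a minimal, illustrative sketch of what that looks like against the vendored crates added in this patch; the module path mirrors the files added below, but the call site itself is an assumption, not code from this diff.

```rust
// Illustrative only: assumes the `postgres-protocol2` crate added below,
// which exposes the SCRAM state machine directly via `authentication::sasl`.
use postgres_protocol2::authentication::sasl::{ChannelBinding, ScramSha256};

async fn scram_exchange_sketch() -> std::io::Result<()> {
    // Client-first message, built by proxy itself rather than by tokio-postgres.
    let mut scram = ScramSha256::new(b"password", ChannelBinding::unsupported());
    let client_first = scram.message().to_vec();

    // ... send `client_first` in a SASLInitialResponse and read the
    // AuthenticationSASLContinue reply from the server ...
    let server_first: &[u8] = b"<server-first-message>"; // placeholder, not a real reply

    // `update` is async in the vendored fork so the PBKDF2 loop can yield.
    scram.update(server_first).await?;
    let _client_final = scram.message().to_vec();
    // ... send it in a SASLResponse and verify the AuthenticationSASLFinal reply ...
    Ok(())
}
```

Keeping that kind of protocol-level code against a private, renamed copy is the point of the split: proxy can keep diverging without dragging the shared rust-postgres patch set along.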
Storage and compute also make use of rust-postgres, but in more normal usage, thus they don't need our crazy changes. Idea: * proxy maintains their subset * other teams use a minimal patch set against upstream rust-postgres Reviewing this code will be difficult. To implement it, I 1. Copied tokio-postgres, postgres-protocol and postgres-types from https://github.com/neondatabase/rust-postgres/tree/00940fcdb57a8e99e805297b75839e7c4c7b1796 2. Updated their package names with the `2` suffix to make them compile in the workspace. 3. Updated proxy to use those packages 4. Copied in the code from tokio-postgres-rustls 0.13 (with some patches applied https://github.com/jbg/tokio-postgres-rustls/pull/32 https://github.com/jbg/tokio-postgres-rustls/pull/33) 5. Removed as much dead code as I could find in the vendored libraries 6. Updated the tokio-postgres-rustls code to use our existing channel binding implementation --- .config/hakari.toml | 3 + Cargo.lock | 56 +- Cargo.toml | 3 + libs/proxy/README.md | 6 + libs/proxy/postgres-protocol2/Cargo.toml | 21 + .../src/authentication/mod.rs | 37 + .../src/authentication/sasl.rs | 516 +++++ .../postgres-protocol2/src/escape/mod.rs | 93 + .../postgres-protocol2/src/escape/test.rs | 17 + libs/proxy/postgres-protocol2/src/lib.rs | 78 + .../postgres-protocol2/src/message/backend.rs | 766 ++++++++ .../src/message/frontend.rs | 297 +++ .../postgres-protocol2/src/message/mod.rs | 8 + .../postgres-protocol2/src/password/mod.rs | 107 ++ .../postgres-protocol2/src/password/test.rs | 19 + .../proxy/postgres-protocol2/src/types/mod.rs | 294 +++ .../postgres-protocol2/src/types/test.rs | 87 + libs/proxy/postgres-types2/Cargo.toml | 10 + libs/proxy/postgres-types2/src/lib.rs | 477 +++++ libs/proxy/postgres-types2/src/private.rs | 34 + libs/proxy/postgres-types2/src/type_gen.rs | 1524 +++++++++++++++ libs/proxy/tokio-postgres2/Cargo.toml | 21 + .../proxy/tokio-postgres2/src/cancel_query.rs | 40 + .../tokio-postgres2/src/cancel_query_raw.rs | 29 + .../proxy/tokio-postgres2/src/cancel_token.rs | 62 + libs/proxy/tokio-postgres2/src/client.rs | 439 +++++ libs/proxy/tokio-postgres2/src/codec.rs | 109 ++ libs/proxy/tokio-postgres2/src/config.rs | 897 +++++++++ libs/proxy/tokio-postgres2/src/connect.rs | 112 ++ libs/proxy/tokio-postgres2/src/connect_raw.rs | 359 ++++ .../tokio-postgres2/src/connect_socket.rs | 65 + libs/proxy/tokio-postgres2/src/connect_tls.rs | 48 + libs/proxy/tokio-postgres2/src/connection.rs | 323 ++++ libs/proxy/tokio-postgres2/src/error/mod.rs | 501 +++++ .../tokio-postgres2/src/error/sqlstate.rs | 1670 +++++++++++++++++ .../tokio-postgres2/src/generic_client.rs | 64 + libs/proxy/tokio-postgres2/src/lib.rs | 148 ++ .../tokio-postgres2/src/maybe_tls_stream.rs | 77 + libs/proxy/tokio-postgres2/src/prepare.rs | 262 +++ libs/proxy/tokio-postgres2/src/query.rs | 340 ++++ libs/proxy/tokio-postgres2/src/row.rs | 300 +++ .../proxy/tokio-postgres2/src/simple_query.rs | 142 ++ libs/proxy/tokio-postgres2/src/statement.rs | 157 ++ libs/proxy/tokio-postgres2/src/tls.rs | 162 ++ .../proxy/tokio-postgres2/src/to_statement.rs | 57 + libs/proxy/tokio-postgres2/src/transaction.rs | 74 + .../src/transaction_builder.rs | 113 ++ libs/proxy/tokio-postgres2/src/types.rs | 6 + proxy/Cargo.toml | 6 +- proxy/src/compute.rs | 5 +- proxy/src/context/mod.rs | 1 + proxy/src/lib.rs | 1 + proxy/src/postgres_rustls/mod.rs | 158 ++ proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/serverless/backend.rs | 2 +- proxy/src/serverless/conn_pool.rs | 5 +- proxy/src/serverless/local_conn_pool.rs | 
11 +- workspace_hack/Cargo.toml | 4 +- 58 files changed, 11199 insertions(+), 26 deletions(-) create mode 100644 libs/proxy/README.md create mode 100644 libs/proxy/postgres-protocol2/Cargo.toml create mode 100644 libs/proxy/postgres-protocol2/src/authentication/mod.rs create mode 100644 libs/proxy/postgres-protocol2/src/authentication/sasl.rs create mode 100644 libs/proxy/postgres-protocol2/src/escape/mod.rs create mode 100644 libs/proxy/postgres-protocol2/src/escape/test.rs create mode 100644 libs/proxy/postgres-protocol2/src/lib.rs create mode 100644 libs/proxy/postgres-protocol2/src/message/backend.rs create mode 100644 libs/proxy/postgres-protocol2/src/message/frontend.rs create mode 100644 libs/proxy/postgres-protocol2/src/message/mod.rs create mode 100644 libs/proxy/postgres-protocol2/src/password/mod.rs create mode 100644 libs/proxy/postgres-protocol2/src/password/test.rs create mode 100644 libs/proxy/postgres-protocol2/src/types/mod.rs create mode 100644 libs/proxy/postgres-protocol2/src/types/test.rs create mode 100644 libs/proxy/postgres-types2/Cargo.toml create mode 100644 libs/proxy/postgres-types2/src/lib.rs create mode 100644 libs/proxy/postgres-types2/src/private.rs create mode 100644 libs/proxy/postgres-types2/src/type_gen.rs create mode 100644 libs/proxy/tokio-postgres2/Cargo.toml create mode 100644 libs/proxy/tokio-postgres2/src/cancel_query.rs create mode 100644 libs/proxy/tokio-postgres2/src/cancel_query_raw.rs create mode 100644 libs/proxy/tokio-postgres2/src/cancel_token.rs create mode 100644 libs/proxy/tokio-postgres2/src/client.rs create mode 100644 libs/proxy/tokio-postgres2/src/codec.rs create mode 100644 libs/proxy/tokio-postgres2/src/config.rs create mode 100644 libs/proxy/tokio-postgres2/src/connect.rs create mode 100644 libs/proxy/tokio-postgres2/src/connect_raw.rs create mode 100644 libs/proxy/tokio-postgres2/src/connect_socket.rs create mode 100644 libs/proxy/tokio-postgres2/src/connect_tls.rs create mode 100644 libs/proxy/tokio-postgres2/src/connection.rs create mode 100644 libs/proxy/tokio-postgres2/src/error/mod.rs create mode 100644 libs/proxy/tokio-postgres2/src/error/sqlstate.rs create mode 100644 libs/proxy/tokio-postgres2/src/generic_client.rs create mode 100644 libs/proxy/tokio-postgres2/src/lib.rs create mode 100644 libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs create mode 100644 libs/proxy/tokio-postgres2/src/prepare.rs create mode 100644 libs/proxy/tokio-postgres2/src/query.rs create mode 100644 libs/proxy/tokio-postgres2/src/row.rs create mode 100644 libs/proxy/tokio-postgres2/src/simple_query.rs create mode 100644 libs/proxy/tokio-postgres2/src/statement.rs create mode 100644 libs/proxy/tokio-postgres2/src/tls.rs create mode 100644 libs/proxy/tokio-postgres2/src/to_statement.rs create mode 100644 libs/proxy/tokio-postgres2/src/transaction.rs create mode 100644 libs/proxy/tokio-postgres2/src/transaction_builder.rs create mode 100644 libs/proxy/tokio-postgres2/src/types.rs create mode 100644 proxy/src/postgres_rustls/mod.rs diff --git a/.config/hakari.toml b/.config/hakari.toml index b5990d090e..3b6d9d8822 100644 --- a/.config/hakari.toml +++ b/.config/hakari.toml @@ -46,6 +46,9 @@ workspace-members = [ "utils", "wal_craft", "walproposer", + "postgres-protocol2", + "postgres-types2", + "tokio-postgres2", ] # Write out exact versions rather than a semver range. (Defaults to false.) 
diff --git a/Cargo.lock b/Cargo.lock index 43a46fb1eb..f05c6311dd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4162,6 +4162,23 @@ dependencies = [ "tokio", ] +[[package]] +name = "postgres-protocol2" +version = "0.1.0" +dependencies = [ + "base64 0.20.0", + "byteorder", + "bytes", + "fallible-iterator", + "hmac", + "md-5", + "memchr", + "rand 0.8.5", + "sha2", + "stringprep", + "tokio", +] + [[package]] name = "postgres-types" version = "0.2.4" @@ -4170,8 +4187,15 @@ dependencies = [ "bytes", "fallible-iterator", "postgres-protocol", - "serde", - "serde_json", +] + +[[package]] +name = "postgres-types2" +version = "0.1.0" +dependencies = [ + "bytes", + "fallible-iterator", + "postgres-protocol2", ] [[package]] @@ -4501,7 +4525,7 @@ dependencies = [ "parquet_derive", "pbkdf2", "pin-project-lite", - "postgres-protocol", + "postgres-protocol2", "postgres_backend", "pq_proto", "prometheus", @@ -4536,8 +4560,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", - "tokio-postgres", - "tokio-postgres-rustls", + "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite", "tokio-util", @@ -6421,6 +6444,7 @@ dependencies = [ "libc", "mio", "num_cpus", + "parking_lot 0.12.1", "pin-project-lite", "signal-hook-registry", "socket2", @@ -6502,6 +6526,26 @@ dependencies = [ "x509-certificate", ] +[[package]] +name = "tokio-postgres2" +version = "0.1.0" +dependencies = [ + "async-trait", + "byteorder", + "bytes", + "fallible-iterator", + "futures-util", + "log", + "parking_lot 0.12.1", + "percent-encoding", + "phf", + "pin-project-lite", + "postgres-protocol2", + "postgres-types2", + "tokio", + "tokio-util", +] + [[package]] name = "tokio-rustls" version = "0.24.0" @@ -7597,7 +7641,6 @@ dependencies = [ "num-traits", "once_cell", "parquet", - "postgres-types", "prettyplease", "proc-macro2", "prost", @@ -7622,7 +7665,6 @@ dependencies = [ "time", "time-macros", "tokio", - "tokio-postgres", "tokio-rustls 0.26.0", "tokio-stream", "tokio-util", diff --git a/Cargo.toml b/Cargo.toml index e3dc5b97f8..742201d0f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,9 @@ members = [ "libs/walproposer", "libs/wal_decoder", "libs/postgres_initdb", + "libs/proxy/postgres-protocol2", + "libs/proxy/postgres-types2", + "libs/proxy/tokio-postgres2", ] [workspace.package] diff --git a/libs/proxy/README.md b/libs/proxy/README.md new file mode 100644 index 0000000000..2ae6210e46 --- /dev/null +++ b/libs/proxy/README.md @@ -0,0 +1,6 @@ +This directory contains libraries that are specific for proxy. + +Currently, it contains a signficant fork/refactoring of rust-postgres that no longer reflects the API +of the original library. Since it was so significant, it made sense to upgrade it to it's own set of libraries. + +Proxy needs unique access to the protocol, which explains why such heavy modifications were necessary. 
diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml new file mode 100644 index 0000000000..284a632954 --- /dev/null +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "postgres-protocol2" +version = "0.1.0" +edition = "2018" +license = "MIT/Apache-2.0" + +[dependencies] +base64 = "0.20" +byteorder.workspace = true +bytes.workspace = true +fallible-iterator.workspace = true +hmac.workspace = true +md-5 = "0.10" +memchr = "2.0" +rand.workspace = true +sha2.workspace = true +stringprep = "0.1" +tokio = { workspace = true, features = ["rt"] } + +[dev-dependencies] +tokio = { workspace = true, features = ["full"] } diff --git a/libs/proxy/postgres-protocol2/src/authentication/mod.rs b/libs/proxy/postgres-protocol2/src/authentication/mod.rs new file mode 100644 index 0000000000..71afa4b9b6 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/authentication/mod.rs @@ -0,0 +1,37 @@ +//! Authentication protocol support. +use md5::{Digest, Md5}; + +pub mod sasl; + +/// Hashes authentication information in a way suitable for use in response +/// to an `AuthenticationMd5Password` message. +/// +/// The resulting string should be sent back to the database in a +/// `PasswordMessage` message. +#[inline] +pub fn md5_hash(username: &[u8], password: &[u8], salt: [u8; 4]) -> String { + let mut md5 = Md5::new(); + md5.update(password); + md5.update(username); + let output = md5.finalize_reset(); + md5.update(format!("{:x}", output)); + md5.update(salt); + format!("md5{:x}", md5.finalize()) +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn md5() { + let username = b"md5_user"; + let password = b"password"; + let salt = [0x2a, 0x3d, 0x8f, 0xe0]; + + assert_eq!( + md5_hash(username, password, salt), + "md562af4dd09bbb41884907a838a3233294" + ); + } +} diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs new file mode 100644 index 0000000000..19aa3c1e9a --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -0,0 +1,516 @@ +//! SASL-based authentication support. + +use hmac::{Hmac, Mac}; +use rand::{self, Rng}; +use sha2::digest::FixedOutput; +use sha2::{Digest, Sha256}; +use std::fmt::Write; +use std::io; +use std::iter; +use std::mem; +use std::str; +use tokio::task::yield_now; + +const NONCE_LENGTH: usize = 24; + +/// The identifier of the SCRAM-SHA-256 SASL authentication mechanism. +pub const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; +/// The identifier of the SCRAM-SHA-256-PLUS SASL authentication mechanism. +pub const SCRAM_SHA_256_PLUS: &str = "SCRAM-SHA-256-PLUS"; + +// since postgres passwords are not required to exclude saslprep-prohibited +// characters or even be valid UTF8, we run saslprep if possible and otherwise +// return the raw password. 
+fn normalize(pass: &[u8]) -> Vec { + let pass = match str::from_utf8(pass) { + Ok(pass) => pass, + Err(_) => return pass.to_vec(), + }; + + match stringprep::saslprep(pass) { + Ok(pass) => pass.into_owned().into_bytes(), + Err(_) => pass.as_bytes().to_vec(), + } +} + +pub(crate) async fn hi(str: &[u8], salt: &[u8], iterations: u32) -> [u8; 32] { + let mut hmac = + Hmac::::new_from_slice(str).expect("HMAC is able to accept all key sizes"); + hmac.update(salt); + hmac.update(&[0, 0, 0, 1]); + let mut prev = hmac.finalize().into_bytes(); + + let mut hi = prev; + + for i in 1..iterations { + let mut hmac = Hmac::::new_from_slice(str).expect("already checked above"); + hmac.update(&prev); + prev = hmac.finalize().into_bytes(); + + for (hi, prev) in hi.iter_mut().zip(prev) { + *hi ^= prev; + } + // yield every ~250us + // hopefully reduces tail latencies + if i % 1024 == 0 { + yield_now().await + } + } + + hi.into() +} + +enum ChannelBindingInner { + Unrequested, + Unsupported, + TlsServerEndPoint(Vec), +} + +/// The channel binding configuration for a SCRAM authentication exchange. +pub struct ChannelBinding(ChannelBindingInner); + +impl ChannelBinding { + /// The server did not request channel binding. + pub fn unrequested() -> ChannelBinding { + ChannelBinding(ChannelBindingInner::Unrequested) + } + + /// The server requested channel binding but the client is unable to provide it. + pub fn unsupported() -> ChannelBinding { + ChannelBinding(ChannelBindingInner::Unsupported) + } + + /// The server requested channel binding and the client will use the `tls-server-end-point` + /// method. + pub fn tls_server_end_point(signature: Vec) -> ChannelBinding { + ChannelBinding(ChannelBindingInner::TlsServerEndPoint(signature)) + } + + fn gs2_header(&self) -> &'static str { + match self.0 { + ChannelBindingInner::Unrequested => "y,,", + ChannelBindingInner::Unsupported => "n,,", + ChannelBindingInner::TlsServerEndPoint(_) => "p=tls-server-end-point,,", + } + } + + fn cbind_data(&self) -> &[u8] { + match self.0 { + ChannelBindingInner::Unrequested | ChannelBindingInner::Unsupported => &[], + ChannelBindingInner::TlsServerEndPoint(ref buf) => buf, + } + } +} + +/// A pair of keys for the SCRAM-SHA-256 mechanism. +/// See for details. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ScramKeys { + /// Used by server to authenticate client. + pub client_key: [u8; N], + /// Used by client to verify server's signature. + pub server_key: [u8; N], +} + +/// Password or keys which were derived from it. +enum Credentials { + /// A regular password as a vector of bytes. + Password(Vec), + /// A precomputed pair of keys. + Keys(Box>), +} + +enum State { + Update { + nonce: String, + password: Credentials<32>, + channel_binding: ChannelBinding, + }, + Finish { + server_key: [u8; 32], + auth_message: String, + }, + Done, +} + +/// A type which handles the client side of the SCRAM-SHA-256/SCRAM-SHA-256-PLUS authentication +/// process. +/// +/// During the authentication process, if the backend sends an `AuthenticationSASL` message which +/// includes `SCRAM-SHA-256` as an authentication mechanism, this type can be used. +/// +/// After a `ScramSha256` is constructed, the buffer returned by the `message()` method should be +/// sent to the backend in a `SASLInitialResponse` message along with the mechanism name. +/// +/// The server will reply with an `AuthenticationSASLContinue` message. 
Its contents should be +/// passed to the `update()` method, after which the buffer returned by the `message()` method +/// should be sent to the backend in a `SASLResponse` message. +/// +/// The server will reply with an `AuthenticationSASLFinal` message. Its contents should be passed +/// to the `finish()` method, after which the authentication process is complete. +pub struct ScramSha256 { + message: String, + state: State, +} + +fn nonce() -> String { + // rand 0.5's ThreadRng is cryptographically secure + let mut rng = rand::thread_rng(); + (0..NONCE_LENGTH) + .map(|_| { + let mut v = rng.gen_range(0x21u8..0x7e); + if v == 0x2c { + v = 0x7e + } + v as char + }) + .collect() +} + +impl ScramSha256 { + /// Constructs a new instance which will use the provided password for authentication. + pub fn new(password: &[u8], channel_binding: ChannelBinding) -> ScramSha256 { + let password = Credentials::Password(normalize(password)); + ScramSha256::new_inner(password, channel_binding, nonce()) + } + + /// Constructs a new instance which will use the provided key pair for authentication. + pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 { + let password = Credentials::Keys(keys.into()); + ScramSha256::new_inner(password, channel_binding, nonce()) + } + + fn new_inner( + password: Credentials<32>, + channel_binding: ChannelBinding, + nonce: String, + ) -> ScramSha256 { + ScramSha256 { + message: format!("{}n=,r={}", channel_binding.gs2_header(), nonce), + state: State::Update { + nonce, + password, + channel_binding, + }, + } + } + + /// Returns the message which should be sent to the backend in an `SASLResponse` message. + pub fn message(&self) -> &[u8] { + if let State::Done = self.state { + panic!("invalid SCRAM state"); + } + self.message.as_bytes() + } + + /// Updates the state machine with the response from the backend. + /// + /// This should be called when an `AuthenticationSASLContinue` message is received. 
+    pub async fn update(&mut self, message: &[u8]) -> io::Result<()> {
+        let (client_nonce, password, channel_binding) =
+            match mem::replace(&mut self.state, State::Done) {
+                State::Update {
+                    nonce,
+                    password,
+                    channel_binding,
+                } => (nonce, password, channel_binding),
+                _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")),
+            };
+
+        let message =
+            str::from_utf8(message).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?;
+
+        let parsed = Parser::new(message).server_first_message()?;
+
+        if !parsed.nonce.starts_with(&client_nonce) {
+            return Err(io::Error::new(io::ErrorKind::InvalidInput, "invalid nonce"));
+        }
+
+        let (client_key, server_key) = match password {
+            Credentials::Password(password) => {
+                let salt = match base64::decode(parsed.salt) {
+                    Ok(salt) => salt,
+                    Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)),
+                };
+
+                let salted_password = hi(&password, &salt, parsed.iteration_count).await;
+
+                let make_key = |name| {
+                    let mut hmac = Hmac::<Sha256>::new_from_slice(&salted_password)
+                        .expect("HMAC is able to accept all key sizes");
+                    hmac.update(name);
+
+                    let mut key = [0u8; 32];
+                    key.copy_from_slice(hmac.finalize().into_bytes().as_slice());
+                    key
+                };
+
+                (make_key(b"Client Key"), make_key(b"Server Key"))
+            }
+            Credentials::Keys(keys) => (keys.client_key, keys.server_key),
+        };
+
+        let mut hash = Sha256::default();
+        hash.update(client_key);
+        let stored_key = hash.finalize_fixed();
+
+        let mut cbind_input = vec![];
+        cbind_input.extend(channel_binding.gs2_header().as_bytes());
+        cbind_input.extend(channel_binding.cbind_data());
+        let cbind_input = base64::encode(&cbind_input);
+
+        self.message.clear();
+        write!(&mut self.message, "c={},r={}", cbind_input, parsed.nonce).unwrap();
+
+        let auth_message = format!("n=,r={},{},{}", client_nonce, message, self.message);
+
+        let mut hmac = Hmac::<Sha256>::new_from_slice(&stored_key)
+            .expect("HMAC is able to accept all key sizes");
+        hmac.update(auth_message.as_bytes());
+        let client_signature = hmac.finalize().into_bytes();
+
+        let mut client_proof = client_key;
+        for (proof, signature) in client_proof.iter_mut().zip(client_signature) {
+            *proof ^= signature;
+        }
+
+        write!(&mut self.message, ",p={}", base64::encode(client_proof)).unwrap();
+
+        self.state = State::Finish {
+            server_key,
+            auth_message,
+        };
+        Ok(())
+    }
+
+    /// Finalizes the authentication process.
+    ///
+    /// This should be called when the backend sends an `AuthenticationSASLFinal` message.
+    /// Authentication has only succeeded if this method returns `Ok(())`.
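+    ///
+    /// A minimal sketch of the whole exchange, assuming some transport layer
+    /// that can exchange the SASL messages with the server (`send_*`/`recv_*`
+    /// below are hypothetical helpers, not part of this crate):
+    ///
+    /// ```ignore
+    /// let mut scram = ScramSha256::new(b"password", ChannelBinding::unsupported());
+    /// send_sasl_initial_response(SCRAM_SHA_256, scram.message())?;
+    /// scram.update(&recv_sasl_continue()?).await?;  // AuthenticationSASLContinue payload
+    /// send_sasl_response(scram.message())?;
+    /// scram.finish(&recv_sasl_final()?)?;           // AuthenticationSASLFinal payload
+    /// ```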
+ pub fn finish(&mut self, message: &[u8]) -> io::Result<()> { + let (server_key, auth_message) = match mem::replace(&mut self.state, State::Done) { + State::Finish { + server_key, + auth_message, + } => (server_key, auth_message), + _ => return Err(io::Error::new(io::ErrorKind::Other, "invalid SCRAM state")), + }; + + let message = + str::from_utf8(message).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))?; + + let parsed = Parser::new(message).server_final_message()?; + + let verifier = match parsed { + ServerFinalMessage::Error(e) => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!("SCRAM error: {}", e), + )); + } + ServerFinalMessage::Verifier(verifier) => verifier, + }; + + let verifier = match base64::decode(verifier) { + Ok(verifier) => verifier, + Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidInput, e)), + }; + + let mut hmac = Hmac::::new_from_slice(&server_key) + .expect("HMAC is able to accept all key sizes"); + hmac.update(auth_message.as_bytes()); + hmac.verify_slice(&verifier) + .map_err(|_| io::Error::new(io::ErrorKind::InvalidInput, "SCRAM verification error")) + } +} + +struct Parser<'a> { + s: &'a str, + it: iter::Peekable>, +} + +impl<'a> Parser<'a> { + fn new(s: &'a str) -> Parser<'a> { + Parser { + s, + it: s.char_indices().peekable(), + } + } + + fn eat(&mut self, target: char) -> io::Result<()> { + match self.it.next() { + Some((_, c)) if c == target => Ok(()), + Some((i, c)) => { + let m = format!( + "unexpected character at byte {}: expected `{}` but got `{}", + i, target, c + ); + Err(io::Error::new(io::ErrorKind::InvalidInput, m)) + } + None => Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )), + } + } + + fn take_while(&mut self, f: F) -> io::Result<&'a str> + where + F: Fn(char) -> bool, + { + let start = match self.it.peek() { + Some(&(i, _)) => i, + None => return Ok(""), + }; + + loop { + match self.it.peek() { + Some(&(_, c)) if f(c) => { + self.it.next(); + } + Some(&(i, _)) => return Ok(&self.s[start..i]), + None => return Ok(&self.s[start..]), + } + } + } + + fn printable(&mut self) -> io::Result<&'a str> { + self.take_while(|c| matches!(c, '\x21'..='\x2b' | '\x2d'..='\x7e')) + } + + fn nonce(&mut self) -> io::Result<&'a str> { + self.eat('r')?; + self.eat('=')?; + self.printable() + } + + fn base64(&mut self) -> io::Result<&'a str> { + self.take_while(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '/' | '+' | '=')) + } + + fn salt(&mut self) -> io::Result<&'a str> { + self.eat('s')?; + self.eat('=')?; + self.base64() + } + + fn posit_number(&mut self) -> io::Result { + let n = self.take_while(|c| c.is_ascii_digit())?; + n.parse() + .map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e)) + } + + fn iteration_count(&mut self) -> io::Result { + self.eat('i')?; + self.eat('=')?; + self.posit_number() + } + + fn eof(&mut self) -> io::Result<()> { + match self.it.peek() { + Some(&(i, _)) => Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unexpected trailing data at byte {}", i), + )), + None => Ok(()), + } + } + + fn server_first_message(&mut self) -> io::Result> { + let nonce = self.nonce()?; + self.eat(',')?; + let salt = self.salt()?; + self.eat(',')?; + let iteration_count = self.iteration_count()?; + self.eof()?; + + Ok(ServerFirstMessage { + nonce, + salt, + iteration_count, + }) + } + + fn value(&mut self) -> io::Result<&'a str> { + self.take_while(|c| matches!(c, '\0' | '=' | ',')) + } + + fn server_error(&mut self) -> io::Result> { + match self.it.peek() { + 
Some(&(_, 'e')) => {} + _ => return Ok(None), + } + + self.eat('e')?; + self.eat('=')?; + self.value().map(Some) + } + + fn verifier(&mut self) -> io::Result<&'a str> { + self.eat('v')?; + self.eat('=')?; + self.base64() + } + + fn server_final_message(&mut self) -> io::Result> { + let message = match self.server_error()? { + Some(error) => ServerFinalMessage::Error(error), + None => ServerFinalMessage::Verifier(self.verifier()?), + }; + self.eof()?; + Ok(message) + } +} + +struct ServerFirstMessage<'a> { + nonce: &'a str, + salt: &'a str, + iteration_count: u32, +} + +enum ServerFinalMessage<'a> { + Error(&'a str), + Verifier(&'a str), +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn parse_server_first_message() { + let message = "r=fyko+d2lbbFgONRv9qkxdawL3rfcNHYJY1ZVvWVs7j,s=QSXCR+Q6sek8bf92,i=4096"; + let message = Parser::new(message).server_first_message().unwrap(); + assert_eq!(message.nonce, "fyko+d2lbbFgONRv9qkxdawL3rfcNHYJY1ZVvWVs7j"); + assert_eq!(message.salt, "QSXCR+Q6sek8bf92"); + assert_eq!(message.iteration_count, 4096); + } + + // recorded auth exchange from psql + #[tokio::test] + async fn exchange() { + let password = "foobar"; + let nonce = "9IZ2O01zb9IgiIZ1WJ/zgpJB"; + + let client_first = "n,,n=,r=9IZ2O01zb9IgiIZ1WJ/zgpJB"; + let server_first = + "r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,s=fs3IXBy7U7+IvVjZ,i\ + =4096"; + let client_final = + "c=biws,r=9IZ2O01zb9IgiIZ1WJ/zgpJBjx/oIRLs02gGSHcw1KEty3eY,p=AmNKosjJzS3\ + 1NTlQYNs5BTeQjdHdk7lOflDo5re2an8="; + let server_final = "v=U+ppxD5XUKtradnv8e2MkeupiA8FU87Sg8CXzXHDAzw="; + + let mut scram = ScramSha256::new_inner( + Credentials::Password(normalize(password.as_bytes())), + ChannelBinding::unsupported(), + nonce.to_string(), + ); + assert_eq!(str::from_utf8(scram.message()).unwrap(), client_first); + + scram.update(server_first.as_bytes()).await.unwrap(); + assert_eq!(str::from_utf8(scram.message()).unwrap(), client_final); + + scram.finish(server_final.as_bytes()).unwrap(); + } +} diff --git a/libs/proxy/postgres-protocol2/src/escape/mod.rs b/libs/proxy/postgres-protocol2/src/escape/mod.rs new file mode 100644 index 0000000000..0ba7efdcac --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/escape/mod.rs @@ -0,0 +1,93 @@ +//! Provides functions for escaping literals and identifiers for use +//! in SQL queries. +//! +//! Prefer parameterized queries where possible. Do not escape +//! parameters in a parameterized query. + +#[cfg(test)] +mod test; + +/// Escape a literal and surround result with single quotes. Not +/// recommended in most cases. +/// +/// If input contains backslashes, result will be of the form ` +/// E'...'` so it is safe to use regardless of the setting of +/// standard_conforming_strings. +pub fn escape_literal(input: &str) -> String { + escape_internal(input, false) +} + +/// Escape an identifier and surround result with double quotes. +pub fn escape_identifier(input: &str) -> String { + escape_internal(input, true) +} + +// Translation of PostgreSQL libpq's PQescapeInternal(). Does not +// require a connection because input string is known to be valid +// UTF-8. +// +// Escape arbitrary strings. If as_ident is true, we escape the +// result as an identifier; if false, as a literal. The result is +// returned in a newly allocated buffer. If we fail due to an +// encoding violation or out of memory condition, we return NULL, +// storing an error message into conn. 
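+//
+// As a quick illustration of the two modes (matching the assertions in
+// escape/test.rs below): escape_identifier turns f"oo into "f""oo" (the embedded
+// quote is doubled), while escape_literal turns f\oo into  E'f\\oo' (the
+// backslash is doubled and the E'...' form is used).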
+fn escape_internal(input: &str, as_ident: bool) -> String { + let mut num_backslashes = 0; + let mut num_quotes = 0; + let quote_char = if as_ident { '"' } else { '\'' }; + + // Scan the string for characters that must be escaped. + for ch in input.chars() { + if ch == quote_char { + num_quotes += 1; + } else if ch == '\\' { + num_backslashes += 1; + } + } + + // Allocate output String. + let mut result_size = input.len() + num_quotes + 3; // two quotes, plus a NUL + if !as_ident && num_backslashes > 0 { + result_size += num_backslashes + 2; + } + + let mut output = String::with_capacity(result_size); + + // If we are escaping a literal that contains backslashes, we use + // the escape string syntax so that the result is correct under + // either value of standard_conforming_strings. We also emit a + // leading space in this case, to guard against the possibility + // that the result might be interpolated immediately following an + // identifier. + if !as_ident && num_backslashes > 0 { + output.push(' '); + output.push('E'); + } + + // Opening quote. + output.push(quote_char); + + // Use fast path if possible. + // + // We've already verified that the input string is well-formed in + // the current encoding. If it contains no quotes and, in the + // case of literal-escaping, no backslashes, then we can just copy + // it directly to the output buffer, adding the necessary quotes. + // + // If not, we must rescan the input and process each character + // individually. + if num_quotes == 0 && (num_backslashes == 0 || as_ident) { + output.push_str(input); + } else { + for ch in input.chars() { + if ch == quote_char || (!as_ident && ch == '\\') { + output.push(ch); + } + output.push(ch); + } + } + + output.push(quote_char); + + output +} diff --git a/libs/proxy/postgres-protocol2/src/escape/test.rs b/libs/proxy/postgres-protocol2/src/escape/test.rs new file mode 100644 index 0000000000..4816a103b7 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/escape/test.rs @@ -0,0 +1,17 @@ +use crate::escape::{escape_identifier, escape_literal}; + +#[test] +fn test_escape_idenifier() { + assert_eq!(escape_identifier("foo"), String::from("\"foo\"")); + assert_eq!(escape_identifier("f\\oo"), String::from("\"f\\oo\"")); + assert_eq!(escape_identifier("f'oo"), String::from("\"f'oo\"")); + assert_eq!(escape_identifier("f\"oo"), String::from("\"f\"\"oo\"")); +} + +#[test] +fn test_escape_literal() { + assert_eq!(escape_literal("foo"), String::from("'foo'")); + assert_eq!(escape_literal("f\\oo"), String::from(" E'f\\\\oo'")); + assert_eq!(escape_literal("f'oo"), String::from("'f''oo'")); + assert_eq!(escape_literal("f\"oo"), String::from("'f\"oo'")); +} diff --git a/libs/proxy/postgres-protocol2/src/lib.rs b/libs/proxy/postgres-protocol2/src/lib.rs new file mode 100644 index 0000000000..947f2f835d --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/lib.rs @@ -0,0 +1,78 @@ +//! Low level Postgres protocol APIs. +//! +//! This crate implements the low level components of Postgres's communication +//! protocol, including message and value serialization and deserialization. +//! It is designed to be used as a building block by higher level APIs such as +//! `rust-postgres`, and should not typically be used directly. +//! +//! # Note +//! +//! This library assumes that the `client_encoding` backend parameter has been +//! set to `UTF8`. It will most likely not behave properly if that is not the case. 
+#![doc(html_root_url = "https://docs.rs/postgres-protocol/0.6")] +#![warn(missing_docs, rust_2018_idioms, clippy::all)] + +use byteorder::{BigEndian, ByteOrder}; +use bytes::{BufMut, BytesMut}; +use std::io; + +pub mod authentication; +pub mod escape; +pub mod message; +pub mod password; +pub mod types; + +/// A Postgres OID. +pub type Oid = u32; + +/// A Postgres Log Sequence Number (LSN). +pub type Lsn = u64; + +/// An enum indicating if a value is `NULL` or not. +pub enum IsNull { + /// The value is `NULL`. + Yes, + /// The value is not `NULL`. + No, +} + +fn write_nullable(serializer: F, buf: &mut BytesMut) -> Result<(), E> +where + F: FnOnce(&mut BytesMut) -> Result, + E: From, +{ + let base = buf.len(); + buf.put_i32(0); + let size = match serializer(buf)? { + IsNull::No => i32::from_usize(buf.len() - base - 4)?, + IsNull::Yes => -1, + }; + BigEndian::write_i32(&mut buf[base..], size); + + Ok(()) +} + +trait FromUsize: Sized { + fn from_usize(x: usize) -> Result; +} + +macro_rules! from_usize { + ($t:ty) => { + impl FromUsize for $t { + #[inline] + fn from_usize(x: usize) -> io::Result<$t> { + if x > <$t>::MAX as usize { + Err(io::Error::new( + io::ErrorKind::InvalidInput, + "value too large to transmit", + )) + } else { + Ok(x as $t) + } + } + } + }; +} + +from_usize!(i16); +from_usize!(i32); diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs new file mode 100644 index 0000000000..356d142f3f --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -0,0 +1,766 @@ +#![allow(missing_docs)] + +use byteorder::{BigEndian, ByteOrder, ReadBytesExt}; +use bytes::{Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use memchr::memchr; +use std::cmp; +use std::io::{self, Read}; +use std::ops::Range; +use std::str; + +use crate::Oid; + +// top-level message tags +const PARSE_COMPLETE_TAG: u8 = b'1'; +const BIND_COMPLETE_TAG: u8 = b'2'; +const CLOSE_COMPLETE_TAG: u8 = b'3'; +pub const NOTIFICATION_RESPONSE_TAG: u8 = b'A'; +const COPY_DONE_TAG: u8 = b'c'; +const COMMAND_COMPLETE_TAG: u8 = b'C'; +const COPY_DATA_TAG: u8 = b'd'; +const DATA_ROW_TAG: u8 = b'D'; +const ERROR_RESPONSE_TAG: u8 = b'E'; +const COPY_IN_RESPONSE_TAG: u8 = b'G'; +const COPY_OUT_RESPONSE_TAG: u8 = b'H'; +const COPY_BOTH_RESPONSE_TAG: u8 = b'W'; +const EMPTY_QUERY_RESPONSE_TAG: u8 = b'I'; +const BACKEND_KEY_DATA_TAG: u8 = b'K'; +pub const NO_DATA_TAG: u8 = b'n'; +pub const NOTICE_RESPONSE_TAG: u8 = b'N'; +const AUTHENTICATION_TAG: u8 = b'R'; +const PORTAL_SUSPENDED_TAG: u8 = b's'; +pub const PARAMETER_STATUS_TAG: u8 = b'S'; +const PARAMETER_DESCRIPTION_TAG: u8 = b't'; +const ROW_DESCRIPTION_TAG: u8 = b'T'; +pub const READY_FOR_QUERY_TAG: u8 = b'Z'; + +#[derive(Debug, Copy, Clone)] +pub struct Header { + tag: u8, + len: i32, +} + +#[allow(clippy::len_without_is_empty)] +impl Header { + #[inline] + pub fn parse(buf: &[u8]) -> io::Result> { + if buf.len() < 5 { + return Ok(None); + } + + let tag = buf[0]; + let len = BigEndian::read_i32(&buf[1..]); + + if len < 4 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid message length: header length < 4", + )); + } + + Ok(Some(Header { tag, len })) + } + + #[inline] + pub fn tag(self) -> u8 { + self.tag + } + + #[inline] + pub fn len(self) -> i32 { + self.len + } +} + +/// An enum representing Postgres backend messages. 
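+///
+/// A minimal sketch of how `Message::parse` is typically driven (`read_more_into`
+/// and `handle` are hypothetical stand-ins for the caller's socket read and
+/// message dispatch):
+///
+/// ```ignore
+/// let mut buf = BytesMut::new();
+/// loop {
+///     match Message::parse(&mut buf)? {
+///         Some(message) => handle(message),
+///         // Not enough bytes buffered yet; `parse` has already reserved space.
+///         None => read_more_into(&mut buf)?,
+///     }
+/// }
+/// ```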
+#[non_exhaustive] +pub enum Message { + AuthenticationCleartextPassword, + AuthenticationGss, + AuthenticationKerberosV5, + AuthenticationMd5Password(AuthenticationMd5PasswordBody), + AuthenticationOk, + AuthenticationScmCredential, + AuthenticationSspi, + AuthenticationGssContinue, + AuthenticationSasl(AuthenticationSaslBody), + AuthenticationSaslContinue(AuthenticationSaslContinueBody), + AuthenticationSaslFinal(AuthenticationSaslFinalBody), + BackendKeyData(BackendKeyDataBody), + BindComplete, + CloseComplete, + CommandComplete(CommandCompleteBody), + CopyData, + CopyDone, + CopyInResponse, + CopyOutResponse, + CopyBothResponse, + DataRow(DataRowBody), + EmptyQueryResponse, + ErrorResponse(ErrorResponseBody), + NoData, + NoticeResponse(NoticeResponseBody), + NotificationResponse(NotificationResponseBody), + ParameterDescription(ParameterDescriptionBody), + ParameterStatus(ParameterStatusBody), + ParseComplete, + PortalSuspended, + ReadyForQuery(ReadyForQueryBody), + RowDescription(RowDescriptionBody), +} + +impl Message { + #[inline] + pub fn parse(buf: &mut BytesMut) -> io::Result> { + if buf.len() < 5 { + let to_read = 5 - buf.len(); + buf.reserve(to_read); + return Ok(None); + } + + let tag = buf[0]; + let len = (&buf[1..5]).read_u32::().unwrap(); + + if len < 4 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: parsing u32", + )); + } + + let total_len = len as usize + 1; + if buf.len() < total_len { + let to_read = total_len - buf.len(); + buf.reserve(to_read); + return Ok(None); + } + + let mut buf = Buffer { + bytes: buf.split_to(total_len).freeze(), + idx: 5, + }; + + let message = match tag { + PARSE_COMPLETE_TAG => Message::ParseComplete, + BIND_COMPLETE_TAG => Message::BindComplete, + CLOSE_COMPLETE_TAG => Message::CloseComplete, + NOTIFICATION_RESPONSE_TAG => { + let process_id = buf.read_i32::()?; + let channel = buf.read_cstr()?; + let message = buf.read_cstr()?; + Message::NotificationResponse(NotificationResponseBody { + process_id, + channel, + message, + }) + } + COPY_DONE_TAG => Message::CopyDone, + COMMAND_COMPLETE_TAG => { + let tag = buf.read_cstr()?; + Message::CommandComplete(CommandCompleteBody { tag }) + } + COPY_DATA_TAG => Message::CopyData, + DATA_ROW_TAG => { + let len = buf.read_u16::()?; + let storage = buf.read_all(); + Message::DataRow(DataRowBody { storage, len }) + } + ERROR_RESPONSE_TAG => { + let storage = buf.read_all(); + Message::ErrorResponse(ErrorResponseBody { storage }) + } + COPY_IN_RESPONSE_TAG => Message::CopyInResponse, + COPY_OUT_RESPONSE_TAG => Message::CopyOutResponse, + COPY_BOTH_RESPONSE_TAG => Message::CopyBothResponse, + EMPTY_QUERY_RESPONSE_TAG => Message::EmptyQueryResponse, + BACKEND_KEY_DATA_TAG => { + let process_id = buf.read_i32::()?; + let secret_key = buf.read_i32::()?; + Message::BackendKeyData(BackendKeyDataBody { + process_id, + secret_key, + }) + } + NO_DATA_TAG => Message::NoData, + NOTICE_RESPONSE_TAG => { + let storage = buf.read_all(); + Message::NoticeResponse(NoticeResponseBody { storage }) + } + AUTHENTICATION_TAG => match buf.read_i32::()? 
{ + 0 => Message::AuthenticationOk, + 2 => Message::AuthenticationKerberosV5, + 3 => Message::AuthenticationCleartextPassword, + 5 => { + let mut salt = [0; 4]; + buf.read_exact(&mut salt)?; + Message::AuthenticationMd5Password(AuthenticationMd5PasswordBody { salt }) + } + 6 => Message::AuthenticationScmCredential, + 7 => Message::AuthenticationGss, + 8 => Message::AuthenticationGssContinue, + 9 => Message::AuthenticationSspi, + 10 => { + let storage = buf.read_all(); + Message::AuthenticationSasl(AuthenticationSaslBody(storage)) + } + 11 => { + let storage = buf.read_all(); + Message::AuthenticationSaslContinue(AuthenticationSaslContinueBody(storage)) + } + 12 => { + let storage = buf.read_all(); + Message::AuthenticationSaslFinal(AuthenticationSaslFinalBody(storage)) + } + tag => { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unknown authentication tag `{}`", tag), + )); + } + }, + PORTAL_SUSPENDED_TAG => Message::PortalSuspended, + PARAMETER_STATUS_TAG => { + let name = buf.read_cstr()?; + let value = buf.read_cstr()?; + Message::ParameterStatus(ParameterStatusBody { name, value }) + } + PARAMETER_DESCRIPTION_TAG => { + let len = buf.read_u16::()?; + let storage = buf.read_all(); + Message::ParameterDescription(ParameterDescriptionBody { storage, len }) + } + ROW_DESCRIPTION_TAG => { + let len = buf.read_u16::()?; + let storage = buf.read_all(); + Message::RowDescription(RowDescriptionBody { storage, len }) + } + READY_FOR_QUERY_TAG => { + let status = buf.read_u8()?; + Message::ReadyForQuery(ReadyForQueryBody { status }) + } + tag => { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("unknown message tag `{}`", tag), + )); + } + }; + + if !buf.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: expected buffer to be empty", + )); + } + + Ok(Some(message)) + } +} + +struct Buffer { + bytes: Bytes, + idx: usize, +} + +impl Buffer { + #[inline] + fn slice(&self) -> &[u8] { + &self.bytes[self.idx..] 
+ } + + #[inline] + fn is_empty(&self) -> bool { + self.slice().is_empty() + } + + #[inline] + fn read_cstr(&mut self) -> io::Result { + match memchr(0, self.slice()) { + Some(pos) => { + let start = self.idx; + let end = start + pos; + let cstr = self.bytes.slice(start..end); + self.idx = end + 1; + Ok(cstr) + } + None => Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )), + } + } + + #[inline] + fn read_all(&mut self) -> Bytes { + let buf = self.bytes.slice(self.idx..); + self.idx = self.bytes.len(); + buf + } +} + +impl Read for Buffer { + #[inline] + fn read(&mut self, buf: &mut [u8]) -> io::Result { + let len = { + let slice = self.slice(); + let len = cmp::min(slice.len(), buf.len()); + buf[..len].copy_from_slice(&slice[..len]); + len + }; + self.idx += len; + Ok(len) + } +} + +pub struct AuthenticationMd5PasswordBody { + salt: [u8; 4], +} + +impl AuthenticationMd5PasswordBody { + #[inline] + pub fn salt(&self) -> [u8; 4] { + self.salt + } +} + +pub struct AuthenticationSaslBody(Bytes); + +impl AuthenticationSaslBody { + #[inline] + pub fn mechanisms(&self) -> SaslMechanisms<'_> { + SaslMechanisms(&self.0) + } +} + +pub struct SaslMechanisms<'a>(&'a [u8]); + +impl<'a> FallibleIterator for SaslMechanisms<'a> { + type Item = &'a str; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result> { + let value_end = find_null(self.0, 0)?; + if value_end == 0 { + if self.0.len() != 1 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid message length: expected to be at end of iterator for sasl", + )); + } + Ok(None) + } else { + let value = get_str(&self.0[..value_end])?; + self.0 = &self.0[value_end + 1..]; + Ok(Some(value)) + } + } +} + +pub struct AuthenticationSaslContinueBody(Bytes); + +impl AuthenticationSaslContinueBody { + #[inline] + pub fn data(&self) -> &[u8] { + &self.0 + } +} + +pub struct AuthenticationSaslFinalBody(Bytes); + +impl AuthenticationSaslFinalBody { + #[inline] + pub fn data(&self) -> &[u8] { + &self.0 + } +} + +pub struct BackendKeyDataBody { + process_id: i32, + secret_key: i32, +} + +impl BackendKeyDataBody { + #[inline] + pub fn process_id(&self) -> i32 { + self.process_id + } + + #[inline] + pub fn secret_key(&self) -> i32 { + self.secret_key + } +} + +pub struct CommandCompleteBody { + tag: Bytes, +} + +impl CommandCompleteBody { + #[inline] + pub fn tag(&self) -> io::Result<&str> { + get_str(&self.tag) + } +} + +#[derive(Debug)] +pub struct DataRowBody { + storage: Bytes, + len: u16, +} + +impl DataRowBody { + #[inline] + pub fn ranges(&self) -> DataRowRanges<'_> { + DataRowRanges { + buf: &self.storage, + len: self.storage.len(), + remaining: self.len, + } + } + + #[inline] + pub fn buffer(&self) -> &[u8] { + &self.storage + } +} + +pub struct DataRowRanges<'a> { + buf: &'a [u8], + len: usize, + remaining: u16, +} + +impl FallibleIterator for DataRowRanges<'_> { + type Item = Option>; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result>>> { + if self.remaining == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: datarowrange is not empty", + )); + } + } + + self.remaining -= 1; + let len = self.buf.read_i32::()?; + if len < 0 { + Ok(Some(None)) + } else { + let len = len as usize; + if self.buf.len() < len { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )); + } + let base = self.len - self.buf.len(); + self.buf = &self.buf[len..]; + 
Ok(Some(Some(base..base + len))) + } + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +pub struct ErrorResponseBody { + storage: Bytes, +} + +impl ErrorResponseBody { + #[inline] + pub fn fields(&self) -> ErrorFields<'_> { + ErrorFields { buf: &self.storage } + } +} + +pub struct ErrorFields<'a> { + buf: &'a [u8], +} + +impl<'a> FallibleIterator for ErrorFields<'a> { + type Item = ErrorField<'a>; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result>> { + let type_ = self.buf.read_u8()?; + if type_ == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: error fields is not drained", + )); + } + } + + let value_end = find_null(self.buf, 0)?; + let value = get_str(&self.buf[..value_end])?; + self.buf = &self.buf[value_end + 1..]; + + Ok(Some(ErrorField { type_, value })) + } +} + +pub struct ErrorField<'a> { + type_: u8, + value: &'a str, +} + +impl ErrorField<'_> { + #[inline] + pub fn type_(&self) -> u8 { + self.type_ + } + + #[inline] + pub fn value(&self) -> &str { + self.value + } +} + +pub struct NoticeResponseBody { + storage: Bytes, +} + +impl NoticeResponseBody { + #[inline] + pub fn fields(&self) -> ErrorFields<'_> { + ErrorFields { buf: &self.storage } + } +} + +pub struct NotificationResponseBody { + process_id: i32, + channel: Bytes, + message: Bytes, +} + +impl NotificationResponseBody { + #[inline] + pub fn process_id(&self) -> i32 { + self.process_id + } + + #[inline] + pub fn channel(&self) -> io::Result<&str> { + get_str(&self.channel) + } + + #[inline] + pub fn message(&self) -> io::Result<&str> { + get_str(&self.message) + } +} + +pub struct ParameterDescriptionBody { + storage: Bytes, + len: u16, +} + +impl ParameterDescriptionBody { + #[inline] + pub fn parameters(&self) -> Parameters<'_> { + Parameters { + buf: &self.storage, + remaining: self.len, + } + } +} + +pub struct Parameters<'a> { + buf: &'a [u8], + remaining: u16, +} + +impl FallibleIterator for Parameters<'_> { + type Item = Oid; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result> { + if self.remaining == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: parameters is not drained", + )); + } + } + + self.remaining -= 1; + self.buf.read_u32::().map(Some) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +pub struct ParameterStatusBody { + name: Bytes, + value: Bytes, +} + +impl ParameterStatusBody { + #[inline] + pub fn name(&self) -> io::Result<&str> { + get_str(&self.name) + } + + #[inline] + pub fn value(&self) -> io::Result<&str> { + get_str(&self.value) + } +} + +pub struct ReadyForQueryBody { + status: u8, +} + +impl ReadyForQueryBody { + #[inline] + pub fn status(&self) -> u8 { + self.status + } +} + +pub struct RowDescriptionBody { + storage: Bytes, + len: u16, +} + +impl RowDescriptionBody { + #[inline] + pub fn fields(&self) -> Fields<'_> { + Fields { + buf: &self.storage, + remaining: self.len, + } + } +} + +pub struct Fields<'a> { + buf: &'a [u8], + remaining: u16, +} + +impl<'a> FallibleIterator for Fields<'a> { + type Item = Field<'a>; + type Error = io::Error; + + #[inline] + fn next(&mut self) -> io::Result>> { + if self.remaining == 0 { + if self.buf.is_empty() { + return Ok(None); + } else { + return 
Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid message length: field is not drained", + )); + } + } + + self.remaining -= 1; + let name_end = find_null(self.buf, 0)?; + let name = get_str(&self.buf[..name_end])?; + self.buf = &self.buf[name_end + 1..]; + let table_oid = self.buf.read_u32::()?; + let column_id = self.buf.read_i16::()?; + let type_oid = self.buf.read_u32::()?; + let type_size = self.buf.read_i16::()?; + let type_modifier = self.buf.read_i32::()?; + let format = self.buf.read_i16::()?; + + Ok(Some(Field { + name, + table_oid, + column_id, + type_oid, + type_size, + type_modifier, + format, + })) + } +} + +pub struct Field<'a> { + name: &'a str, + table_oid: Oid, + column_id: i16, + type_oid: Oid, + type_size: i16, + type_modifier: i32, + format: i16, +} + +impl<'a> Field<'a> { + #[inline] + pub fn name(&self) -> &'a str { + self.name + } + + #[inline] + pub fn table_oid(&self) -> Oid { + self.table_oid + } + + #[inline] + pub fn column_id(&self) -> i16 { + self.column_id + } + + #[inline] + pub fn type_oid(&self) -> Oid { + self.type_oid + } + + #[inline] + pub fn type_size(&self) -> i16 { + self.type_size + } + + #[inline] + pub fn type_modifier(&self) -> i32 { + self.type_modifier + } + + #[inline] + pub fn format(&self) -> i16 { + self.format + } +} + +#[inline] +fn find_null(buf: &[u8], start: usize) -> io::Result { + match memchr(0, &buf[start..]) { + Some(pos) => Ok(pos + start), + None => Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF", + )), + } +} + +#[inline] +fn get_str(buf: &[u8]) -> io::Result<&str> { + str::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e)) +} diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs new file mode 100644 index 0000000000..5d0a8ff8c8 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -0,0 +1,297 @@ +//! Frontend message serialization. 
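+//!
+//! Each function here appends one frontend message to a `BytesMut`; the caller
+//! is responsible for writing the buffer to the connection. A minimal sketch:
+//!
+//! ```ignore
+//! let mut buf = bytes::BytesMut::new();
+//! frontend::query("SELECT 1", &mut buf)?;
+//! frontend::terminate(&mut buf);
+//! // `buf` now holds a Query ('Q') message followed by a Terminate ('X') message.
+//! ```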
+#![allow(missing_docs)] + +use byteorder::{BigEndian, ByteOrder}; +use bytes::{Buf, BufMut, BytesMut}; +use std::convert::TryFrom; +use std::error::Error; +use std::io; +use std::marker; + +use crate::{write_nullable, FromUsize, IsNull, Oid}; + +#[inline] +fn write_body(buf: &mut BytesMut, f: F) -> Result<(), E> +where + F: FnOnce(&mut BytesMut) -> Result<(), E>, + E: From, +{ + let base = buf.len(); + buf.extend_from_slice(&[0; 4]); + + f(buf)?; + + let size = i32::from_usize(buf.len() - base)?; + BigEndian::write_i32(&mut buf[base..], size); + Ok(()) +} + +pub enum BindError { + Conversion(Box), + Serialization(io::Error), +} + +impl From> for BindError { + #[inline] + fn from(e: Box) -> BindError { + BindError::Conversion(e) + } +} + +impl From for BindError { + #[inline] + fn from(e: io::Error) -> BindError { + BindError::Serialization(e) + } +} + +#[inline] +pub fn bind( + portal: &str, + statement: &str, + formats: I, + values: J, + mut serializer: F, + result_formats: K, + buf: &mut BytesMut, +) -> Result<(), BindError> +where + I: IntoIterator, + J: IntoIterator, + F: FnMut(T, &mut BytesMut) -> Result>, + K: IntoIterator, +{ + buf.put_u8(b'B'); + + write_body(buf, |buf| { + write_cstr(portal.as_bytes(), buf)?; + write_cstr(statement.as_bytes(), buf)?; + write_counted( + formats, + |f, buf| { + buf.put_i16(f); + Ok::<_, io::Error>(()) + }, + buf, + )?; + write_counted( + values, + |v, buf| write_nullable(|buf| serializer(v, buf), buf), + buf, + )?; + write_counted( + result_formats, + |f, buf| { + buf.put_i16(f); + Ok::<_, io::Error>(()) + }, + buf, + )?; + + Ok(()) + }) +} + +#[inline] +fn write_counted(items: I, mut serializer: F, buf: &mut BytesMut) -> Result<(), E> +where + I: IntoIterator, + F: FnMut(T, &mut BytesMut) -> Result<(), E>, + E: From, +{ + let base = buf.len(); + buf.extend_from_slice(&[0; 2]); + let mut count = 0; + for item in items { + serializer(item, buf)?; + count += 1; + } + let count = i16::from_usize(count)?; + BigEndian::write_i16(&mut buf[base..], count); + + Ok(()) +} + +#[inline] +pub fn cancel_request(process_id: i32, secret_key: i32, buf: &mut BytesMut) { + write_body(buf, |buf| { + buf.put_i32(80_877_102); + buf.put_i32(process_id); + buf.put_i32(secret_key); + Ok::<_, io::Error>(()) + }) + .unwrap(); +} + +#[inline] +pub fn close(variant: u8, name: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'C'); + write_body(buf, |buf| { + buf.put_u8(variant); + write_cstr(name.as_bytes(), buf) + }) +} + +pub struct CopyData { + buf: T, + len: i32, +} + +impl CopyData +where + T: Buf, +{ + pub fn new(buf: T) -> io::Result> { + let len = buf + .remaining() + .checked_add(4) + .and_then(|l| i32::try_from(l).ok()) + .ok_or_else(|| { + io::Error::new(io::ErrorKind::InvalidInput, "message length overflow") + })?; + + Ok(CopyData { buf, len }) + } + + pub fn write(self, out: &mut BytesMut) { + out.put_u8(b'd'); + out.put_i32(self.len); + out.put(self.buf); + } +} + +#[inline] +pub fn copy_done(buf: &mut BytesMut) { + buf.put_u8(b'c'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + +#[inline] +pub fn copy_fail(message: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'f'); + write_body(buf, |buf| write_cstr(message.as_bytes(), buf)) +} + +#[inline] +pub fn describe(variant: u8, name: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'D'); + write_body(buf, |buf| { + buf.put_u8(variant); + write_cstr(name.as_bytes(), buf) + }) +} + +#[inline] +pub fn execute(portal: &str, max_rows: i32, buf: &mut BytesMut) -> 
io::Result<()> { + buf.put_u8(b'E'); + write_body(buf, |buf| { + write_cstr(portal.as_bytes(), buf)?; + buf.put_i32(max_rows); + Ok(()) + }) +} + +#[inline] +pub fn parse(name: &str, query: &str, param_types: I, buf: &mut BytesMut) -> io::Result<()> +where + I: IntoIterator, +{ + buf.put_u8(b'P'); + write_body(buf, |buf| { + write_cstr(name.as_bytes(), buf)?; + write_cstr(query.as_bytes(), buf)?; + write_counted( + param_types, + |t, buf| { + buf.put_u32(t); + Ok::<_, io::Error>(()) + }, + buf, + )?; + Ok(()) + }) +} + +#[inline] +pub fn password_message(password: &[u8], buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'p'); + write_body(buf, |buf| write_cstr(password, buf)) +} + +#[inline] +pub fn query(query: &str, buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'Q'); + write_body(buf, |buf| write_cstr(query.as_bytes(), buf)) +} + +#[inline] +pub fn sasl_initial_response(mechanism: &str, data: &[u8], buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'p'); + write_body(buf, |buf| { + write_cstr(mechanism.as_bytes(), buf)?; + let len = i32::from_usize(data.len())?; + buf.put_i32(len); + buf.put_slice(data); + Ok(()) + }) +} + +#[inline] +pub fn sasl_response(data: &[u8], buf: &mut BytesMut) -> io::Result<()> { + buf.put_u8(b'p'); + write_body(buf, |buf| { + buf.put_slice(data); + Ok(()) + }) +} + +#[inline] +pub fn ssl_request(buf: &mut BytesMut) { + write_body(buf, |buf| { + buf.put_i32(80_877_103); + Ok::<_, io::Error>(()) + }) + .unwrap(); +} + +#[inline] +pub fn startup_message<'a, I>(parameters: I, buf: &mut BytesMut) -> io::Result<()> +where + I: IntoIterator, +{ + write_body(buf, |buf| { + // postgres protocol version 3.0(196608) in bigger-endian + buf.put_i32(0x00_03_00_00); + for (key, value) in parameters { + write_cstr(key.as_bytes(), buf)?; + write_cstr(value.as_bytes(), buf)?; + } + buf.put_u8(0); + Ok(()) + }) +} + +#[inline] +pub fn sync(buf: &mut BytesMut) { + buf.put_u8(b'S'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + +#[inline] +pub fn terminate(buf: &mut BytesMut) { + buf.put_u8(b'X'); + write_body(buf, |_| Ok::<(), io::Error>(())).unwrap(); +} + +#[inline] +fn write_cstr(s: &[u8], buf: &mut BytesMut) -> Result<(), io::Error> { + if s.contains(&0) { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "string contains embedded null", + )); + } + buf.put_slice(s); + buf.put_u8(0); + Ok(()) +} diff --git a/libs/proxy/postgres-protocol2/src/message/mod.rs b/libs/proxy/postgres-protocol2/src/message/mod.rs new file mode 100644 index 0000000000..9e5d997548 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/message/mod.rs @@ -0,0 +1,8 @@ +//! Postgres message protocol support. +//! +//! See [Postgres's documentation][docs] for more information on message flow. +//! +//! [docs]: https://www.postgresql.org/docs/9.5/static/protocol-flow.html + +pub mod backend; +pub mod frontend; diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs new file mode 100644 index 0000000000..e669e80f3f --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -0,0 +1,107 @@ +//! Functions to encrypt a password in the client. +//! +//! This is intended to be used by client applications that wish to +//! send commands like `ALTER USER joe PASSWORD 'pwd'`. The password +//! need not be sent in cleartext if it is encrypted on the client +//! side. This is good because it ensures the cleartext password won't +//! end up in logs pg_stat displays, etc. 
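+//!
+//! A minimal sketch of the intended use (splicing the result into the SQL
+//! string is shown only for illustration):
+//!
+//! ```ignore
+//! let verifier = password::scram_sha_256(b"pwd").await;
+//! let sql = format!("ALTER USER joe PASSWORD '{}'", verifier);
+//! ```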
+
+use crate::authentication::sasl;
+use hmac::{Hmac, Mac};
+use md5::Md5;
+use rand::RngCore;
+use sha2::digest::FixedOutput;
+use sha2::{Digest, Sha256};
+
+#[cfg(test)]
+mod test;
+
+const SCRAM_DEFAULT_ITERATIONS: u32 = 4096;
+const SCRAM_DEFAULT_SALT_LEN: usize = 16;
+
+/// Hash password using SCRAM-SHA-256 with a randomly-generated
+/// salt.
+///
+/// The client may assume the returned string doesn't contain any
+/// special characters that would require escaping in an SQL command.
+pub async fn scram_sha_256(password: &[u8]) -> String {
+    let mut salt: [u8; SCRAM_DEFAULT_SALT_LEN] = [0; SCRAM_DEFAULT_SALT_LEN];
+    let mut rng = rand::thread_rng();
+    rng.fill_bytes(&mut salt);
+    scram_sha_256_salt(password, salt).await
+}
+
+// Internal implementation of scram_sha_256 with a caller-provided
+// salt. This is useful for testing.
+pub(crate) async fn scram_sha_256_salt(
+    password: &[u8],
+    salt: [u8; SCRAM_DEFAULT_SALT_LEN],
+) -> String {
+    // Prepare the password, per [RFC
+    // 4013](https://tools.ietf.org/html/rfc4013), if possible.
+    //
+    // Postgres treats passwords as byte strings (without embedded NUL
+    // bytes), but SASL expects passwords to be valid UTF-8.
+    //
+    // Follow the behavior of libpq's PQencryptPasswordConn(), and
+    // also the backend. If the password is not valid UTF-8, or if it
+    // contains prohibited characters (such as non-ASCII whitespace),
+    // just skip the SASLprep step and use the original byte
+    // sequence.
+    let prepared: Vec<u8> = match std::str::from_utf8(password) {
+        Ok(password_str) => {
+            match stringprep::saslprep(password_str) {
+                Ok(p) => p.into_owned().into_bytes(),
+                // contains invalid characters; skip saslprep
+                Err(_) => Vec::from(password),
+            }
+        }
+        // not valid UTF-8; skip saslprep
+        Err(_) => Vec::from(password),
+    };
+
+    // salt password
+    let salted_password = sasl::hi(&prepared, &salt, SCRAM_DEFAULT_ITERATIONS).await;
+
+    // client key
+    let mut hmac = Hmac::<Sha256>::new_from_slice(&salted_password)
+        .expect("HMAC is able to accept all key sizes");
+    hmac.update(b"Client Key");
+    let client_key = hmac.finalize().into_bytes();
+
+    // stored key
+    let mut hash = Sha256::default();
+    hash.update(client_key.as_slice());
+    let stored_key = hash.finalize_fixed();
+
+    // server key
+    let mut hmac = Hmac::<Sha256>::new_from_slice(&salted_password)
+        .expect("HMAC is able to accept all key sizes");
+    hmac.update(b"Server Key");
+    let server_key = hmac.finalize().into_bytes();
+
+    format!(
+        "SCRAM-SHA-256${}:{}${}:{}",
+        SCRAM_DEFAULT_ITERATIONS,
+        base64::encode(salt),
+        base64::encode(stored_key),
+        base64::encode(server_key)
+    )
+}
+
+/// **Not recommended, as MD5 is not considered to be secure.**
+///
+/// Hash password using MD5 with the username as the salt.
+///
+/// The client may assume the returned string doesn't contain any
+/// special characters that would require escaping.
+pub fn md5(password: &[u8], username: &str) -> String { + // salt password with username + let mut salted_password = Vec::from(password); + salted_password.extend_from_slice(username.as_bytes()); + + let mut hash = Md5::new(); + hash.update(&salted_password); + let digest = hash.finalize(); + format!("md5{:x}", digest) +} diff --git a/libs/proxy/postgres-protocol2/src/password/test.rs b/libs/proxy/postgres-protocol2/src/password/test.rs new file mode 100644 index 0000000000..c9d340f09d --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/password/test.rs @@ -0,0 +1,19 @@ +use crate::password; + +#[tokio::test] +async fn test_encrypt_scram_sha_256() { + // Specify the salt to make the test deterministic. Any bytes will do. + let salt: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + assert_eq!( + password::scram_sha_256_salt(b"secret", salt).await, + "SCRAM-SHA-256$4096:AQIDBAUGBwgJCgsMDQ4PEA==$8rrDg00OqaiWXJ7p+sCgHEIaBSHY89ZJl3mfIsf32oY=:05L1f+yZbiN8O0AnO40Og85NNRhvzTS57naKRWCcsIA=" + ); +} + +#[test] +fn test_encrypt_md5() { + assert_eq!( + password::md5(b"secret", "foo"), + "md54ab2c5d00339c4b2a4e921d2dc4edec7" + ); +} diff --git a/libs/proxy/postgres-protocol2/src/types/mod.rs b/libs/proxy/postgres-protocol2/src/types/mod.rs new file mode 100644 index 0000000000..78131c05bf --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/types/mod.rs @@ -0,0 +1,294 @@ +//! Conversions to and from Postgres's binary format for various types. +use byteorder::{BigEndian, ReadBytesExt}; +use bytes::{BufMut, BytesMut}; +use fallible_iterator::FallibleIterator; +use std::boxed::Box as StdBox; +use std::error::Error; +use std::str; + +use crate::Oid; + +#[cfg(test)] +mod test; + +/// Serializes a `TEXT`, `VARCHAR`, `CHAR(n)`, `NAME`, or `CITEXT` value. +#[inline] +pub fn text_to_sql(v: &str, buf: &mut BytesMut) { + buf.put_slice(v.as_bytes()); +} + +/// Deserializes a `TEXT`, `VARCHAR`, `CHAR(n)`, `NAME`, or `CITEXT` value. +#[inline] +pub fn text_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + Ok(str::from_utf8(buf)?) +} + +/// Deserializes a `"char"` value. +#[inline] +pub fn char_from_sql(mut buf: &[u8]) -> Result> { + let v = buf.read_i8()?; + if !buf.is_empty() { + return Err("invalid buffer size".into()); + } + Ok(v) +} + +/// Serializes an `OID` value. +#[inline] +pub fn oid_to_sql(v: Oid, buf: &mut BytesMut) { + buf.put_u32(v); +} + +/// Deserializes an `OID` value. +#[inline] +pub fn oid_from_sql(mut buf: &[u8]) -> Result> { + let v = buf.read_u32::()?; + if !buf.is_empty() { + return Err("invalid buffer size".into()); + } + Ok(v) +} + +/// A fallible iterator over `HSTORE` entries. 
+pub struct HstoreEntries<'a> { + remaining: i32, + buf: &'a [u8], +} + +impl<'a> FallibleIterator for HstoreEntries<'a> { + type Item = (&'a str, Option<&'a str>); + type Error = StdBox; + + #[inline] + #[allow(clippy::type_complexity)] + fn next( + &mut self, + ) -> Result)>, StdBox> { + if self.remaining == 0 { + if !self.buf.is_empty() { + return Err("invalid buffer size".into()); + } + return Ok(None); + } + + self.remaining -= 1; + + let key_len = self.buf.read_i32::()?; + if key_len < 0 { + return Err("invalid key length".into()); + } + let (key, buf) = self.buf.split_at(key_len as usize); + let key = str::from_utf8(key)?; + self.buf = buf; + + let value_len = self.buf.read_i32::()?; + let value = if value_len < 0 { + None + } else { + let (value, buf) = self.buf.split_at(value_len as usize); + let value = str::from_utf8(value)?; + self.buf = buf; + Some(value) + }; + + Ok(Some((key, value))) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +/// Deserializes an array value. +#[inline] +pub fn array_from_sql(mut buf: &[u8]) -> Result, StdBox> { + let dimensions = buf.read_i32::()?; + if dimensions < 0 { + return Err("invalid dimension count".into()); + } + + let mut r = buf; + let mut elements = 1i32; + for _ in 0..dimensions { + let len = r.read_i32::()?; + if len < 0 { + return Err("invalid dimension size".into()); + } + let _lower_bound = r.read_i32::()?; + elements = match elements.checked_mul(len) { + Some(elements) => elements, + None => return Err("too many array elements".into()), + }; + } + + if dimensions == 0 { + elements = 0; + } + + Ok(Array { + dimensions, + elements, + buf, + }) +} + +/// A Postgres array. +pub struct Array<'a> { + dimensions: i32, + elements: i32, + buf: &'a [u8], +} + +impl<'a> Array<'a> { + /// Returns an iterator over the dimensions of the array. + #[inline] + pub fn dimensions(&self) -> ArrayDimensions<'a> { + ArrayDimensions(&self.buf[..self.dimensions as usize * 8]) + } + + /// Returns an iterator over the values of the array. + #[inline] + pub fn values(&self) -> ArrayValues<'a> { + ArrayValues { + remaining: self.elements, + buf: &self.buf[self.dimensions as usize * 8..], + } + } +} + +/// An iterator over the dimensions of an array. +pub struct ArrayDimensions<'a>(&'a [u8]); + +impl FallibleIterator for ArrayDimensions<'_> { + type Item = ArrayDimension; + type Error = StdBox; + + #[inline] + fn next(&mut self) -> Result, StdBox> { + if self.0.is_empty() { + return Ok(None); + } + + let len = self.0.read_i32::()?; + let lower_bound = self.0.read_i32::()?; + + Ok(Some(ArrayDimension { len, lower_bound })) + } + + #[inline] + fn size_hint(&self) -> (usize, Option) { + let len = self.0.len() / 8; + (len, Some(len)) + } +} + +/// Information about a dimension of an array. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct ArrayDimension { + /// The length of this dimension. + pub len: i32, + + /// The base value used to index into this dimension. + pub lower_bound: i32, +} + +/// An iterator over the values of an array, in row-major order. 
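+///
+/// A minimal sketch of walking an array received from the server, using the
+/// `FallibleIterator` trait (`raw` is assumed to hold the binary array payload):
+///
+/// ```ignore
+/// let array = array_from_sql(raw)?;
+/// let mut values = array.values();
+/// while let Some(value) = values.next()? {
+///     // `value` is `None` for a NULL element and `Some(&[u8])` otherwise.
+/// }
+/// ```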
+pub struct ArrayValues<'a> { + remaining: i32, + buf: &'a [u8], +} + +impl<'a> FallibleIterator for ArrayValues<'a> { + type Item = Option<&'a [u8]>; + type Error = StdBox; + + #[inline] + fn next(&mut self) -> Result>, StdBox> { + if self.remaining == 0 { + if !self.buf.is_empty() { + return Err("invalid message length: arrayvalue not drained".into()); + } + return Ok(None); + } + self.remaining -= 1; + + let len = self.buf.read_i32::()?; + let val = if len < 0 { + None + } else { + if self.buf.len() < len as usize { + return Err("invalid value length".into()); + } + + let (val, buf) = self.buf.split_at(len as usize); + self.buf = buf; + Some(val) + }; + + Ok(Some(val)) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.remaining as usize; + (len, Some(len)) + } +} + +/// Serializes a Postgres ltree string +#[inline] +pub fn ltree_to_sql(v: &str, buf: &mut BytesMut) { + // A version number is prepended to an ltree string per spec + buf.put_u8(1); + // Append the rest of the query + buf.put_slice(v.as_bytes()); +} + +/// Deserialize a Postgres ltree string +#[inline] +pub fn ltree_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + match buf { + // Remove the version number from the front of the ltree per spec + [1u8, rest @ ..] => Ok(str::from_utf8(rest)?), + _ => Err("ltree version 1 only supported".into()), + } +} + +/// Serializes a Postgres lquery string +#[inline] +pub fn lquery_to_sql(v: &str, buf: &mut BytesMut) { + // A version number is prepended to an lquery string per spec + buf.put_u8(1); + // Append the rest of the query + buf.put_slice(v.as_bytes()); +} + +/// Deserialize a Postgres lquery string +#[inline] +pub fn lquery_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + match buf { + // Remove the version number from the front of the lquery per spec + [1u8, rest @ ..] => Ok(str::from_utf8(rest)?), + _ => Err("lquery version 1 only supported".into()), + } +} + +/// Serializes a Postgres ltxtquery string +#[inline] +pub fn ltxtquery_to_sql(v: &str, buf: &mut BytesMut) { + // A version number is prepended to an ltxtquery string per spec + buf.put_u8(1); + // Append the rest of the query + buf.put_slice(v.as_bytes()); +} + +/// Deserialize a Postgres ltxtquery string +#[inline] +pub fn ltxtquery_from_sql(buf: &[u8]) -> Result<&str, StdBox> { + match buf { + // Remove the version number from the front of the ltxtquery per spec + [1u8, rest @ ..] 
=> Ok(str::from_utf8(rest)?), + _ => Err("ltxtquery version 1 only supported".into()), + } +} diff --git a/libs/proxy/postgres-protocol2/src/types/test.rs b/libs/proxy/postgres-protocol2/src/types/test.rs new file mode 100644 index 0000000000..96cc055bc3 --- /dev/null +++ b/libs/proxy/postgres-protocol2/src/types/test.rs @@ -0,0 +1,87 @@ +use bytes::{Buf, BytesMut}; + +use super::*; + +#[test] +fn ltree_sql() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + let mut buf = BytesMut::new(); + + ltree_to_sql("A.B.C", &mut buf); + + assert_eq!(query.as_slice(), buf.chunk()); +} + +#[test] +fn ltree_str() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_ok()) +} + +#[test] +fn ltree_wrong_version() { + let mut query = vec![2u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_err()) +} + +#[test] +fn lquery_sql() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + let mut buf = BytesMut::new(); + + lquery_to_sql("A.B.C", &mut buf); + + assert_eq!(query.as_slice(), buf.chunk()); +} + +#[test] +fn lquery_str() { + let mut query = vec![1u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(lquery_from_sql(query.as_slice()).is_ok()) +} + +#[test] +fn lquery_wrong_version() { + let mut query = vec![2u8]; + query.extend_from_slice("A.B.C".as_bytes()); + + assert!(lquery_from_sql(query.as_slice()).is_err()) +} + +#[test] +fn ltxtquery_sql() { + let mut query = vec![1u8]; + query.extend_from_slice("a & b*".as_bytes()); + + let mut buf = BytesMut::new(); + + ltree_to_sql("a & b*", &mut buf); + + assert_eq!(query.as_slice(), buf.chunk()); +} + +#[test] +fn ltxtquery_str() { + let mut query = vec![1u8]; + query.extend_from_slice("a & b*".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_ok()) +} + +#[test] +fn ltxtquery_wrong_version() { + let mut query = vec![2u8]; + query.extend_from_slice("a & b*".as_bytes()); + + assert!(ltree_from_sql(query.as_slice()).is_err()) +} diff --git a/libs/proxy/postgres-types2/Cargo.toml b/libs/proxy/postgres-types2/Cargo.toml new file mode 100644 index 0000000000..58cfb5571f --- /dev/null +++ b/libs/proxy/postgres-types2/Cargo.toml @@ -0,0 +1,10 @@ +[package] +name = "postgres-types2" +version = "0.1.0" +edition = "2018" +license = "MIT/Apache-2.0" + +[dependencies] +bytes.workspace = true +fallible-iterator.workspace = true +postgres-protocol2 = { path = "../postgres-protocol2" } diff --git a/libs/proxy/postgres-types2/src/lib.rs b/libs/proxy/postgres-types2/src/lib.rs new file mode 100644 index 0000000000..18ba032151 --- /dev/null +++ b/libs/proxy/postgres-types2/src/lib.rs @@ -0,0 +1,477 @@ +//! Conversions to and from Postgres types. +//! +//! This crate is used by the `tokio-postgres` and `postgres` crates. You normally don't need to depend directly on it +//! unless you want to define your own `ToSql` or `FromSql` definitions. +#![doc(html_root_url = "https://docs.rs/postgres-types/0.2")] +#![warn(clippy::all, rust_2018_idioms, missing_docs)] + +use fallible_iterator::FallibleIterator; +use postgres_protocol2::types; +use std::any::type_name; +use std::error::Error; +use std::fmt; +use std::sync::Arc; + +use crate::type_gen::{Inner, Other}; + +#[doc(inline)] +pub use postgres_protocol2::Oid; + +use bytes::BytesMut; + +/// Generates a simple implementation of `ToSql::accepts` which accepts the +/// types passed to it. +macro_rules! 
accepts { + ($($expected:ident),+) => ( + fn accepts(ty: &$crate::Type) -> bool { + matches!(*ty, $($crate::Type::$expected)|+) + } + ) +} + +/// Generates an implementation of `ToSql::to_sql_checked`. +/// +/// All `ToSql` implementations should use this macro. +macro_rules! to_sql_checked { + () => { + fn to_sql_checked( + &self, + ty: &$crate::Type, + out: &mut $crate::private::BytesMut, + ) -> ::std::result::Result< + $crate::IsNull, + Box, + > { + $crate::__to_sql_checked(self, ty, out) + } + }; +} + +// WARNING: this function is not considered part of this crate's public API. +// It is subject to change at any time. +#[doc(hidden)] +pub fn __to_sql_checked( + v: &T, + ty: &Type, + out: &mut BytesMut, +) -> Result> +where + T: ToSql, +{ + if !T::accepts(ty) { + return Err(Box::new(WrongType::new::(ty.clone()))); + } + v.to_sql(ty, out) +} + +// mod pg_lsn; +#[doc(hidden)] +pub mod private; +// mod special; +mod type_gen; + +/// A Postgres type. +#[derive(PartialEq, Eq, Clone, Hash)] +pub struct Type(Inner); + +impl fmt::Debug for Type { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt::Debug::fmt(&self.0, fmt) + } +} + +impl fmt::Display for Type { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + match self.schema() { + "public" | "pg_catalog" => {} + schema => write!(fmt, "{}.", schema)?, + } + fmt.write_str(self.name()) + } +} + +impl Type { + /// Creates a new `Type`. + pub fn new(name: String, oid: Oid, kind: Kind, schema: String) -> Type { + Type(Inner::Other(Arc::new(Other { + name, + oid, + kind, + schema, + }))) + } + + /// Returns the `Type` corresponding to the provided `Oid` if it + /// corresponds to a built-in type. + pub fn from_oid(oid: Oid) -> Option { + Inner::from_oid(oid).map(Type) + } + + /// Returns the OID of the `Type`. + pub fn oid(&self) -> Oid { + self.0.oid() + } + + /// Returns the kind of this type. + pub fn kind(&self) -> &Kind { + self.0.kind() + } + + /// Returns the schema of this type. + pub fn schema(&self) -> &str { + match self.0 { + Inner::Other(ref u) => &u.schema, + _ => "pg_catalog", + } + } + + /// Returns the name of this type. + pub fn name(&self) -> &str { + self.0.name() + } +} + +/// Represents the kind of a Postgres type. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[non_exhaustive] +pub enum Kind { + /// A simple type like `VARCHAR` or `INTEGER`. + Simple, + /// An enumerated type along with its variants. + Enum(Vec), + /// A pseudo-type. + Pseudo, + /// An array type along with the type of its elements. + Array(Type), + /// A range type along with the type of its elements. + Range(Type), + /// A multirange type along with the type of its elements. + Multirange(Type), + /// A domain type along with its underlying type. + Domain(Type), + /// A composite type along with information about its fields. + Composite(Vec), +} + +/// Information about a field of a composite type. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Field { + name: String, + type_: Type, +} + +impl Field { + /// Creates a new `Field`. + pub fn new(name: String, type_: Type) -> Field { + Field { name, type_ } + } + + /// Returns the name of the field. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns the type of the field. + pub fn type_(&self) -> &Type { + &self.type_ + } +} + +/// An error indicating that a `NULL` Postgres value was passed to a `FromSql` +/// implementation that does not support `NULL` values. 
+#[derive(Debug, Clone, Copy)]
+pub struct WasNull;
+
+impl fmt::Display for WasNull {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt.write_str("a Postgres value was `NULL`")
+    }
+}
+
+impl Error for WasNull {}
+
+/// An error indicating that a conversion was attempted between incompatible
+/// Rust and Postgres types.
+#[derive(Debug)]
+pub struct WrongType {
+    postgres: Type,
+    rust: &'static str,
+}
+
+impl fmt::Display for WrongType {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            fmt,
+            "cannot convert between the Rust type `{}` and the Postgres type `{}`",
+            self.rust, self.postgres,
+        )
+    }
+}
+
+impl Error for WrongType {}
+
+impl WrongType {
+    /// Creates a new `WrongType` error.
+    pub fn new<T>(ty: Type) -> WrongType {
+        WrongType {
+            postgres: ty,
+            rust: type_name::<T>(),
+        }
+    }
+}
+
+/// An error indicating that an `as_text` conversion was attempted on a binary
+/// result.
+#[derive(Debug)]
+pub struct WrongFormat {}
+
+impl Error for WrongFormat {}
+
+impl fmt::Display for WrongFormat {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(
+            fmt,
+            "cannot read column as text while it is in binary format"
+        )
+    }
+}
+
+/// A trait for types that can be created from a Postgres value.
+pub trait FromSql<'a>: Sized {
+    /// Creates a new value of this type from a buffer of data of the specified
+    /// Postgres `Type` in its binary format.
+    ///
+    /// The caller of this method is responsible for ensuring that this type
+    /// is compatible with the Postgres `Type`.
+    fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<Self, Box<dyn Error + Sync + Send>>;
+
+    /// Creates a new value of this type from a `NULL` SQL value.
+    ///
+    /// The caller of this method is responsible for ensuring that this type
+    /// is compatible with the Postgres `Type`.
+    ///
+    /// The default implementation returns `Err(Box::new(WasNull))`.
+    #[allow(unused_variables)]
+    fn from_sql_null(ty: &Type) -> Result<Self, Box<dyn Error + Sync + Send>> {
+        Err(Box::new(WasNull))
+    }
+
+    /// A convenience function that delegates to `from_sql` and `from_sql_null` depending on the
+    /// value of `raw`.
+    fn from_sql_nullable(
+        ty: &Type,
+        raw: Option<&'a [u8]>,
+    ) -> Result<Self, Box<dyn Error + Sync + Send>> {
+        match raw {
+            Some(raw) => Self::from_sql(ty, raw),
+            None => Self::from_sql_null(ty),
+        }
+    }
+
+    /// Determines if a value of this type can be created from the specified
+    /// Postgres `Type`.
+    fn accepts(ty: &Type) -> bool;
+}
+
+/// A trait for types which can be created from a Postgres value without borrowing any data.
+///
+/// This is primarily useful for trait bounds on functions.
+pub trait FromSqlOwned: for<'a> FromSql<'a> {}
+
+impl<T> FromSqlOwned for T where T: for<'a> FromSql<'a> {}
+
+impl<'a, T: FromSql<'a>> FromSql<'a> for Option<T> {
+    fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<Option<T>, Box<dyn Error + Sync + Send>> {
+        <T as FromSql>::from_sql(ty, raw).map(Some)
+    }
+
+    fn from_sql_null(_: &Type) -> Result<Option<T>, Box<dyn Error + Sync + Send>> {
+        Ok(None)
+    }
+
+    fn accepts(ty: &Type) -> bool {
+        <T as FromSql>::accepts(ty)
+    }
+}
+
+impl<'a, T: FromSql<'a>> FromSql<'a> for Vec<T> {
+    fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<Vec<T>, Box<dyn Error + Sync + Send>> {
+        let member_type = match *ty.kind() {
+            Kind::Array(ref member) => member,
+            _ => panic!("expected array type"),
+        };
+
+        let array = types::array_from_sql(raw)?;
+        if array.dimensions().count()? > 1 {
+            return Err("array contains too many dimensions".into());
+        }
+
+        array
+            .values()
+            .map(|v| T::from_sql_nullable(member_type, v))
+            .collect()
+    }
+
+    fn accepts(ty: &Type) -> bool {
+        match *ty.kind() {
+            Kind::Array(ref inner) => T::accepts(inner),
+            _ => false,
+        }
+    }
+}
+
+impl<'a> FromSql<'a> for String {
+    fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<String, Box<dyn Error + Sync + Send>> {
+        <&str as FromSql>::from_sql(ty, raw).map(ToString::to_string)
+    }
+
+    fn accepts(ty: &Type) -> bool {
+        <&str as FromSql>::accepts(ty)
+    }
+}
+
+impl<'a> FromSql<'a> for &'a str {
+    fn from_sql(ty: &Type, raw: &'a [u8]) -> Result<&'a str, Box<dyn Error + Sync + Send>> {
+        match *ty {
+            ref ty if ty.name() == "ltree" => types::ltree_from_sql(raw),
+            ref ty if ty.name() == "lquery" => types::lquery_from_sql(raw),
+            ref ty if ty.name() == "ltxtquery" => types::ltxtquery_from_sql(raw),
+            _ => types::text_from_sql(raw),
+        }
+    }
+
+    fn accepts(ty: &Type) -> bool {
+        match *ty {
+            Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true,
+            ref ty
+                if (ty.name() == "citext"
+                    || ty.name() == "ltree"
+                    || ty.name() == "lquery"
+                    || ty.name() == "ltxtquery") =>
+            {
+                true
+            }
+            _ => false,
+        }
+    }
+}
+
+macro_rules! simple_from {
+    ($t:ty, $f:ident, $($expected:ident),+) => {
+        impl<'a> FromSql<'a> for $t {
+            fn from_sql(_: &Type, raw: &'a [u8]) -> Result<$t, Box<dyn Error + Sync + Send>> {
+                types::$f(raw)
+            }
+
+            accepts!($($expected),+);
+        }
+    }
+}
+
+simple_from!(i8, char_from_sql, CHAR);
+simple_from!(u32, oid_from_sql, OID);
+
+/// An enum representing the nullability of a Postgres value.
+pub enum IsNull {
+    /// The value is NULL.
+    Yes,
+    /// The value is not NULL.
+    No,
+}
+
+/// A trait for types that can be converted into Postgres values.
+pub trait ToSql: fmt::Debug {
+    /// Converts the value of `self` into the binary format of the specified
+    /// Postgres `Type`, appending it to `out`.
+    ///
+    /// The caller of this method is responsible for ensuring that this type
+    /// is compatible with the Postgres `Type`.
+    ///
+    /// The return value indicates if this value should be represented as
+    /// `NULL`. If this is the case, implementations **must not** write
+    /// anything to `out`.
+    fn to_sql(&self, ty: &Type, out: &mut BytesMut) -> Result<IsNull, Box<dyn Error + Sync + Send>>
+    where
+        Self: Sized;
+
+    /// Determines if a value of this type can be converted to the specified
+    /// Postgres `Type`.
+    fn accepts(ty: &Type) -> bool
+    where
+        Self: Sized;
+
+    /// An adaptor method used internally by Rust-Postgres.
+    ///
+    /// *All* implementations of this method should be generated by the
+    /// `to_sql_checked!()` macro.
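+    ///
+    /// For illustration only, a minimal sketch of an implementing type (the
+    /// `Label` wrapper is hypothetical, not part of this crate):
+    ///
+    /// ```ignore
+    /// #[derive(Debug)]
+    /// struct Label(String);
+    ///
+    /// impl ToSql for Label {
+    ///     fn to_sql(&self, ty: &Type, out: &mut BytesMut)
+    ///         -> Result<IsNull, Box<dyn std::error::Error + Sync + Send>> {
+    ///         // Delegate to the existing `&str` text encoding.
+    ///         self.0.as_str().to_sql(ty, out)
+    ///     }
+    ///
+    ///     fn accepts(ty: &Type) -> bool {
+    ///         <&str as ToSql>::accepts(ty)
+    ///     }
+    ///
+    ///     // Generates `to_sql_checked`, which re-checks `accepts` at runtime.
+    ///     to_sql_checked!();
+    /// }
+    /// ```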
+ fn to_sql_checked( + &self, + ty: &Type, + out: &mut BytesMut, + ) -> Result>; + + /// Specify the encode format + fn encode_format(&self, _ty: &Type) -> Format { + Format::Binary + } +} + +/// Supported Postgres message format types +/// +/// Using Text format in a message assumes a Postgres `SERVER_ENCODING` of `UTF8` +#[derive(Clone, Copy, Debug, PartialEq)] +pub enum Format { + /// Text format (UTF-8) + Text, + /// Compact, typed binary format + Binary, +} + +impl ToSql for &str { + fn to_sql(&self, ty: &Type, w: &mut BytesMut) -> Result> { + match *ty { + ref ty if ty.name() == "ltree" => types::ltree_to_sql(self, w), + ref ty if ty.name() == "lquery" => types::lquery_to_sql(self, w), + ref ty if ty.name() == "ltxtquery" => types::ltxtquery_to_sql(self, w), + _ => types::text_to_sql(self, w), + } + Ok(IsNull::No) + } + + fn accepts(ty: &Type) -> bool { + match *ty { + Type::VARCHAR | Type::TEXT | Type::BPCHAR | Type::NAME | Type::UNKNOWN => true, + ref ty + if (ty.name() == "citext" + || ty.name() == "ltree" + || ty.name() == "lquery" + || ty.name() == "ltxtquery") => + { + true + } + _ => false, + } + } + + to_sql_checked!(); +} + +macro_rules! simple_to { + ($t:ty, $f:ident, $($expected:ident),+) => { + impl ToSql for $t { + fn to_sql(&self, + _: &Type, + w: &mut BytesMut) + -> Result> { + types::$f(*self, w); + Ok(IsNull::No) + } + + accepts!($($expected),+); + + to_sql_checked!(); + } + } +} + +simple_to!(u32, oid_to_sql, OID); diff --git a/libs/proxy/postgres-types2/src/private.rs b/libs/proxy/postgres-types2/src/private.rs new file mode 100644 index 0000000000..774f9a301c --- /dev/null +++ b/libs/proxy/postgres-types2/src/private.rs @@ -0,0 +1,34 @@ +use crate::{FromSql, Type}; +pub use bytes::BytesMut; +use std::error::Error; + +pub fn read_be_i32(buf: &mut &[u8]) -> Result> { + if buf.len() < 4 { + return Err("invalid buffer size".into()); + } + let mut bytes = [0; 4]; + bytes.copy_from_slice(&buf[..4]); + *buf = &buf[4..]; + Ok(i32::from_be_bytes(bytes)) +} + +pub fn read_value<'a, T>( + type_: &Type, + buf: &mut &'a [u8], +) -> Result> +where + T: FromSql<'a>, +{ + let len = read_be_i32(buf)?; + let value = if len < 0 { + None + } else { + if len as usize > buf.len() { + return Err("invalid buffer size".into()); + } + let (head, tail) = buf.split_at(len as usize); + *buf = tail; + Some(head) + }; + T::from_sql_nullable(type_, value) +} diff --git a/libs/proxy/postgres-types2/src/type_gen.rs b/libs/proxy/postgres-types2/src/type_gen.rs new file mode 100644 index 0000000000..a1bc3f85c0 --- /dev/null +++ b/libs/proxy/postgres-types2/src/type_gen.rs @@ -0,0 +1,1524 @@ +// Autogenerated file - DO NOT EDIT +use std::sync::Arc; + +use crate::{Kind, Oid, Type}; + +#[derive(PartialEq, Eq, Debug, Hash)] +pub struct Other { + pub name: String, + pub oid: Oid, + pub kind: Kind, + pub schema: String, +} + +#[derive(PartialEq, Eq, Clone, Debug, Hash)] +pub enum Inner { + Bool, + Bytea, + Char, + Name, + Int8, + Int2, + Int2Vector, + Int4, + Regproc, + Text, + Oid, + Tid, + Xid, + Cid, + OidVector, + PgDdlCommand, + Json, + Xml, + XmlArray, + PgNodeTree, + JsonArray, + TableAmHandler, + Xid8Array, + IndexAmHandler, + Point, + Lseg, + Path, + Box, + Polygon, + Line, + LineArray, + Cidr, + CidrArray, + Float4, + Float8, + Unknown, + Circle, + CircleArray, + Macaddr8, + Macaddr8Array, + Money, + MoneyArray, + Macaddr, + Inet, + BoolArray, + ByteaArray, + CharArray, + NameArray, + Int2Array, + Int2VectorArray, + Int4Array, + RegprocArray, + TextArray, + TidArray, + XidArray, + CidArray, + 
OidVectorArray, + BpcharArray, + VarcharArray, + Int8Array, + PointArray, + LsegArray, + PathArray, + BoxArray, + Float4Array, + Float8Array, + PolygonArray, + OidArray, + Aclitem, + AclitemArray, + MacaddrArray, + InetArray, + Bpchar, + Varchar, + Date, + Time, + Timestamp, + TimestampArray, + DateArray, + TimeArray, + Timestamptz, + TimestamptzArray, + Interval, + IntervalArray, + NumericArray, + CstringArray, + Timetz, + TimetzArray, + Bit, + BitArray, + Varbit, + VarbitArray, + Numeric, + Refcursor, + RefcursorArray, + Regprocedure, + Regoper, + Regoperator, + Regclass, + Regtype, + RegprocedureArray, + RegoperArray, + RegoperatorArray, + RegclassArray, + RegtypeArray, + Record, + Cstring, + Any, + Anyarray, + Void, + Trigger, + LanguageHandler, + Internal, + Anyelement, + RecordArray, + Anynonarray, + TxidSnapshotArray, + Uuid, + UuidArray, + TxidSnapshot, + FdwHandler, + PgLsn, + PgLsnArray, + TsmHandler, + PgNdistinct, + PgDependencies, + Anyenum, + TsVector, + Tsquery, + GtsVector, + TsVectorArray, + GtsVectorArray, + TsqueryArray, + Regconfig, + RegconfigArray, + Regdictionary, + RegdictionaryArray, + Jsonb, + JsonbArray, + AnyRange, + EventTrigger, + Int4Range, + Int4RangeArray, + NumRange, + NumRangeArray, + TsRange, + TsRangeArray, + TstzRange, + TstzRangeArray, + DateRange, + DateRangeArray, + Int8Range, + Int8RangeArray, + Jsonpath, + JsonpathArray, + Regnamespace, + RegnamespaceArray, + Regrole, + RegroleArray, + Regcollation, + RegcollationArray, + Int4multiRange, + NummultiRange, + TsmultiRange, + TstzmultiRange, + DatemultiRange, + Int8multiRange, + AnymultiRange, + AnycompatiblemultiRange, + PgBrinBloomSummary, + PgBrinMinmaxMultiSummary, + PgMcvList, + PgSnapshot, + PgSnapshotArray, + Xid8, + Anycompatible, + Anycompatiblearray, + Anycompatiblenonarray, + AnycompatibleRange, + Int4multiRangeArray, + NummultiRangeArray, + TsmultiRangeArray, + TstzmultiRangeArray, + DatemultiRangeArray, + Int8multiRangeArray, + Other(Arc), +} + +impl Inner { + pub fn from_oid(oid: Oid) -> Option { + match oid { + 16 => Some(Inner::Bool), + 17 => Some(Inner::Bytea), + 18 => Some(Inner::Char), + 19 => Some(Inner::Name), + 20 => Some(Inner::Int8), + 21 => Some(Inner::Int2), + 22 => Some(Inner::Int2Vector), + 23 => Some(Inner::Int4), + 24 => Some(Inner::Regproc), + 25 => Some(Inner::Text), + 26 => Some(Inner::Oid), + 27 => Some(Inner::Tid), + 28 => Some(Inner::Xid), + 29 => Some(Inner::Cid), + 30 => Some(Inner::OidVector), + 32 => Some(Inner::PgDdlCommand), + 114 => Some(Inner::Json), + 142 => Some(Inner::Xml), + 143 => Some(Inner::XmlArray), + 194 => Some(Inner::PgNodeTree), + 199 => Some(Inner::JsonArray), + 269 => Some(Inner::TableAmHandler), + 271 => Some(Inner::Xid8Array), + 325 => Some(Inner::IndexAmHandler), + 600 => Some(Inner::Point), + 601 => Some(Inner::Lseg), + 602 => Some(Inner::Path), + 603 => Some(Inner::Box), + 604 => Some(Inner::Polygon), + 628 => Some(Inner::Line), + 629 => Some(Inner::LineArray), + 650 => Some(Inner::Cidr), + 651 => Some(Inner::CidrArray), + 700 => Some(Inner::Float4), + 701 => Some(Inner::Float8), + 705 => Some(Inner::Unknown), + 718 => Some(Inner::Circle), + 719 => Some(Inner::CircleArray), + 774 => Some(Inner::Macaddr8), + 775 => Some(Inner::Macaddr8Array), + 790 => Some(Inner::Money), + 791 => Some(Inner::MoneyArray), + 829 => Some(Inner::Macaddr), + 869 => Some(Inner::Inet), + 1000 => Some(Inner::BoolArray), + 1001 => Some(Inner::ByteaArray), + 1002 => Some(Inner::CharArray), + 1003 => Some(Inner::NameArray), + 1005 => Some(Inner::Int2Array), + 1006 => 
Some(Inner::Int2VectorArray), + 1007 => Some(Inner::Int4Array), + 1008 => Some(Inner::RegprocArray), + 1009 => Some(Inner::TextArray), + 1010 => Some(Inner::TidArray), + 1011 => Some(Inner::XidArray), + 1012 => Some(Inner::CidArray), + 1013 => Some(Inner::OidVectorArray), + 1014 => Some(Inner::BpcharArray), + 1015 => Some(Inner::VarcharArray), + 1016 => Some(Inner::Int8Array), + 1017 => Some(Inner::PointArray), + 1018 => Some(Inner::LsegArray), + 1019 => Some(Inner::PathArray), + 1020 => Some(Inner::BoxArray), + 1021 => Some(Inner::Float4Array), + 1022 => Some(Inner::Float8Array), + 1027 => Some(Inner::PolygonArray), + 1028 => Some(Inner::OidArray), + 1033 => Some(Inner::Aclitem), + 1034 => Some(Inner::AclitemArray), + 1040 => Some(Inner::MacaddrArray), + 1041 => Some(Inner::InetArray), + 1042 => Some(Inner::Bpchar), + 1043 => Some(Inner::Varchar), + 1082 => Some(Inner::Date), + 1083 => Some(Inner::Time), + 1114 => Some(Inner::Timestamp), + 1115 => Some(Inner::TimestampArray), + 1182 => Some(Inner::DateArray), + 1183 => Some(Inner::TimeArray), + 1184 => Some(Inner::Timestamptz), + 1185 => Some(Inner::TimestamptzArray), + 1186 => Some(Inner::Interval), + 1187 => Some(Inner::IntervalArray), + 1231 => Some(Inner::NumericArray), + 1263 => Some(Inner::CstringArray), + 1266 => Some(Inner::Timetz), + 1270 => Some(Inner::TimetzArray), + 1560 => Some(Inner::Bit), + 1561 => Some(Inner::BitArray), + 1562 => Some(Inner::Varbit), + 1563 => Some(Inner::VarbitArray), + 1700 => Some(Inner::Numeric), + 1790 => Some(Inner::Refcursor), + 2201 => Some(Inner::RefcursorArray), + 2202 => Some(Inner::Regprocedure), + 2203 => Some(Inner::Regoper), + 2204 => Some(Inner::Regoperator), + 2205 => Some(Inner::Regclass), + 2206 => Some(Inner::Regtype), + 2207 => Some(Inner::RegprocedureArray), + 2208 => Some(Inner::RegoperArray), + 2209 => Some(Inner::RegoperatorArray), + 2210 => Some(Inner::RegclassArray), + 2211 => Some(Inner::RegtypeArray), + 2249 => Some(Inner::Record), + 2275 => Some(Inner::Cstring), + 2276 => Some(Inner::Any), + 2277 => Some(Inner::Anyarray), + 2278 => Some(Inner::Void), + 2279 => Some(Inner::Trigger), + 2280 => Some(Inner::LanguageHandler), + 2281 => Some(Inner::Internal), + 2283 => Some(Inner::Anyelement), + 2287 => Some(Inner::RecordArray), + 2776 => Some(Inner::Anynonarray), + 2949 => Some(Inner::TxidSnapshotArray), + 2950 => Some(Inner::Uuid), + 2951 => Some(Inner::UuidArray), + 2970 => Some(Inner::TxidSnapshot), + 3115 => Some(Inner::FdwHandler), + 3220 => Some(Inner::PgLsn), + 3221 => Some(Inner::PgLsnArray), + 3310 => Some(Inner::TsmHandler), + 3361 => Some(Inner::PgNdistinct), + 3402 => Some(Inner::PgDependencies), + 3500 => Some(Inner::Anyenum), + 3614 => Some(Inner::TsVector), + 3615 => Some(Inner::Tsquery), + 3642 => Some(Inner::GtsVector), + 3643 => Some(Inner::TsVectorArray), + 3644 => Some(Inner::GtsVectorArray), + 3645 => Some(Inner::TsqueryArray), + 3734 => Some(Inner::Regconfig), + 3735 => Some(Inner::RegconfigArray), + 3769 => Some(Inner::Regdictionary), + 3770 => Some(Inner::RegdictionaryArray), + 3802 => Some(Inner::Jsonb), + 3807 => Some(Inner::JsonbArray), + 3831 => Some(Inner::AnyRange), + 3838 => Some(Inner::EventTrigger), + 3904 => Some(Inner::Int4Range), + 3905 => Some(Inner::Int4RangeArray), + 3906 => Some(Inner::NumRange), + 3907 => Some(Inner::NumRangeArray), + 3908 => Some(Inner::TsRange), + 3909 => Some(Inner::TsRangeArray), + 3910 => Some(Inner::TstzRange), + 3911 => Some(Inner::TstzRangeArray), + 3912 => Some(Inner::DateRange), + 3913 => 
Some(Inner::DateRangeArray), + 3926 => Some(Inner::Int8Range), + 3927 => Some(Inner::Int8RangeArray), + 4072 => Some(Inner::Jsonpath), + 4073 => Some(Inner::JsonpathArray), + 4089 => Some(Inner::Regnamespace), + 4090 => Some(Inner::RegnamespaceArray), + 4096 => Some(Inner::Regrole), + 4097 => Some(Inner::RegroleArray), + 4191 => Some(Inner::Regcollation), + 4192 => Some(Inner::RegcollationArray), + 4451 => Some(Inner::Int4multiRange), + 4532 => Some(Inner::NummultiRange), + 4533 => Some(Inner::TsmultiRange), + 4534 => Some(Inner::TstzmultiRange), + 4535 => Some(Inner::DatemultiRange), + 4536 => Some(Inner::Int8multiRange), + 4537 => Some(Inner::AnymultiRange), + 4538 => Some(Inner::AnycompatiblemultiRange), + 4600 => Some(Inner::PgBrinBloomSummary), + 4601 => Some(Inner::PgBrinMinmaxMultiSummary), + 5017 => Some(Inner::PgMcvList), + 5038 => Some(Inner::PgSnapshot), + 5039 => Some(Inner::PgSnapshotArray), + 5069 => Some(Inner::Xid8), + 5077 => Some(Inner::Anycompatible), + 5078 => Some(Inner::Anycompatiblearray), + 5079 => Some(Inner::Anycompatiblenonarray), + 5080 => Some(Inner::AnycompatibleRange), + 6150 => Some(Inner::Int4multiRangeArray), + 6151 => Some(Inner::NummultiRangeArray), + 6152 => Some(Inner::TsmultiRangeArray), + 6153 => Some(Inner::TstzmultiRangeArray), + 6155 => Some(Inner::DatemultiRangeArray), + 6157 => Some(Inner::Int8multiRangeArray), + _ => None, + } + } + + pub fn oid(&self) -> Oid { + match *self { + Inner::Bool => 16, + Inner::Bytea => 17, + Inner::Char => 18, + Inner::Name => 19, + Inner::Int8 => 20, + Inner::Int2 => 21, + Inner::Int2Vector => 22, + Inner::Int4 => 23, + Inner::Regproc => 24, + Inner::Text => 25, + Inner::Oid => 26, + Inner::Tid => 27, + Inner::Xid => 28, + Inner::Cid => 29, + Inner::OidVector => 30, + Inner::PgDdlCommand => 32, + Inner::Json => 114, + Inner::Xml => 142, + Inner::XmlArray => 143, + Inner::PgNodeTree => 194, + Inner::JsonArray => 199, + Inner::TableAmHandler => 269, + Inner::Xid8Array => 271, + Inner::IndexAmHandler => 325, + Inner::Point => 600, + Inner::Lseg => 601, + Inner::Path => 602, + Inner::Box => 603, + Inner::Polygon => 604, + Inner::Line => 628, + Inner::LineArray => 629, + Inner::Cidr => 650, + Inner::CidrArray => 651, + Inner::Float4 => 700, + Inner::Float8 => 701, + Inner::Unknown => 705, + Inner::Circle => 718, + Inner::CircleArray => 719, + Inner::Macaddr8 => 774, + Inner::Macaddr8Array => 775, + Inner::Money => 790, + Inner::MoneyArray => 791, + Inner::Macaddr => 829, + Inner::Inet => 869, + Inner::BoolArray => 1000, + Inner::ByteaArray => 1001, + Inner::CharArray => 1002, + Inner::NameArray => 1003, + Inner::Int2Array => 1005, + Inner::Int2VectorArray => 1006, + Inner::Int4Array => 1007, + Inner::RegprocArray => 1008, + Inner::TextArray => 1009, + Inner::TidArray => 1010, + Inner::XidArray => 1011, + Inner::CidArray => 1012, + Inner::OidVectorArray => 1013, + Inner::BpcharArray => 1014, + Inner::VarcharArray => 1015, + Inner::Int8Array => 1016, + Inner::PointArray => 1017, + Inner::LsegArray => 1018, + Inner::PathArray => 1019, + Inner::BoxArray => 1020, + Inner::Float4Array => 1021, + Inner::Float8Array => 1022, + Inner::PolygonArray => 1027, + Inner::OidArray => 1028, + Inner::Aclitem => 1033, + Inner::AclitemArray => 1034, + Inner::MacaddrArray => 1040, + Inner::InetArray => 1041, + Inner::Bpchar => 1042, + Inner::Varchar => 1043, + Inner::Date => 1082, + Inner::Time => 1083, + Inner::Timestamp => 1114, + Inner::TimestampArray => 1115, + Inner::DateArray => 1182, + Inner::TimeArray => 1183, + Inner::Timestamptz 
=> 1184, + Inner::TimestamptzArray => 1185, + Inner::Interval => 1186, + Inner::IntervalArray => 1187, + Inner::NumericArray => 1231, + Inner::CstringArray => 1263, + Inner::Timetz => 1266, + Inner::TimetzArray => 1270, + Inner::Bit => 1560, + Inner::BitArray => 1561, + Inner::Varbit => 1562, + Inner::VarbitArray => 1563, + Inner::Numeric => 1700, + Inner::Refcursor => 1790, + Inner::RefcursorArray => 2201, + Inner::Regprocedure => 2202, + Inner::Regoper => 2203, + Inner::Regoperator => 2204, + Inner::Regclass => 2205, + Inner::Regtype => 2206, + Inner::RegprocedureArray => 2207, + Inner::RegoperArray => 2208, + Inner::RegoperatorArray => 2209, + Inner::RegclassArray => 2210, + Inner::RegtypeArray => 2211, + Inner::Record => 2249, + Inner::Cstring => 2275, + Inner::Any => 2276, + Inner::Anyarray => 2277, + Inner::Void => 2278, + Inner::Trigger => 2279, + Inner::LanguageHandler => 2280, + Inner::Internal => 2281, + Inner::Anyelement => 2283, + Inner::RecordArray => 2287, + Inner::Anynonarray => 2776, + Inner::TxidSnapshotArray => 2949, + Inner::Uuid => 2950, + Inner::UuidArray => 2951, + Inner::TxidSnapshot => 2970, + Inner::FdwHandler => 3115, + Inner::PgLsn => 3220, + Inner::PgLsnArray => 3221, + Inner::TsmHandler => 3310, + Inner::PgNdistinct => 3361, + Inner::PgDependencies => 3402, + Inner::Anyenum => 3500, + Inner::TsVector => 3614, + Inner::Tsquery => 3615, + Inner::GtsVector => 3642, + Inner::TsVectorArray => 3643, + Inner::GtsVectorArray => 3644, + Inner::TsqueryArray => 3645, + Inner::Regconfig => 3734, + Inner::RegconfigArray => 3735, + Inner::Regdictionary => 3769, + Inner::RegdictionaryArray => 3770, + Inner::Jsonb => 3802, + Inner::JsonbArray => 3807, + Inner::AnyRange => 3831, + Inner::EventTrigger => 3838, + Inner::Int4Range => 3904, + Inner::Int4RangeArray => 3905, + Inner::NumRange => 3906, + Inner::NumRangeArray => 3907, + Inner::TsRange => 3908, + Inner::TsRangeArray => 3909, + Inner::TstzRange => 3910, + Inner::TstzRangeArray => 3911, + Inner::DateRange => 3912, + Inner::DateRangeArray => 3913, + Inner::Int8Range => 3926, + Inner::Int8RangeArray => 3927, + Inner::Jsonpath => 4072, + Inner::JsonpathArray => 4073, + Inner::Regnamespace => 4089, + Inner::RegnamespaceArray => 4090, + Inner::Regrole => 4096, + Inner::RegroleArray => 4097, + Inner::Regcollation => 4191, + Inner::RegcollationArray => 4192, + Inner::Int4multiRange => 4451, + Inner::NummultiRange => 4532, + Inner::TsmultiRange => 4533, + Inner::TstzmultiRange => 4534, + Inner::DatemultiRange => 4535, + Inner::Int8multiRange => 4536, + Inner::AnymultiRange => 4537, + Inner::AnycompatiblemultiRange => 4538, + Inner::PgBrinBloomSummary => 4600, + Inner::PgBrinMinmaxMultiSummary => 4601, + Inner::PgMcvList => 5017, + Inner::PgSnapshot => 5038, + Inner::PgSnapshotArray => 5039, + Inner::Xid8 => 5069, + Inner::Anycompatible => 5077, + Inner::Anycompatiblearray => 5078, + Inner::Anycompatiblenonarray => 5079, + Inner::AnycompatibleRange => 5080, + Inner::Int4multiRangeArray => 6150, + Inner::NummultiRangeArray => 6151, + Inner::TsmultiRangeArray => 6152, + Inner::TstzmultiRangeArray => 6153, + Inner::DatemultiRangeArray => 6155, + Inner::Int8multiRangeArray => 6157, + Inner::Other(ref u) => u.oid, + } + } + + pub fn kind(&self) -> &Kind { + match *self { + Inner::Bool => &Kind::Simple, + Inner::Bytea => &Kind::Simple, + Inner::Char => &Kind::Simple, + Inner::Name => &Kind::Simple, + Inner::Int8 => &Kind::Simple, + Inner::Int2 => &Kind::Simple, + Inner::Int2Vector => &Kind::Array(Type(Inner::Int2)), + Inner::Int4 => 
&Kind::Simple, + Inner::Regproc => &Kind::Simple, + Inner::Text => &Kind::Simple, + Inner::Oid => &Kind::Simple, + Inner::Tid => &Kind::Simple, + Inner::Xid => &Kind::Simple, + Inner::Cid => &Kind::Simple, + Inner::OidVector => &Kind::Array(Type(Inner::Oid)), + Inner::PgDdlCommand => &Kind::Pseudo, + Inner::Json => &Kind::Simple, + Inner::Xml => &Kind::Simple, + Inner::XmlArray => &Kind::Array(Type(Inner::Xml)), + Inner::PgNodeTree => &Kind::Simple, + Inner::JsonArray => &Kind::Array(Type(Inner::Json)), + Inner::TableAmHandler => &Kind::Pseudo, + Inner::Xid8Array => &Kind::Array(Type(Inner::Xid8)), + Inner::IndexAmHandler => &Kind::Pseudo, + Inner::Point => &Kind::Simple, + Inner::Lseg => &Kind::Simple, + Inner::Path => &Kind::Simple, + Inner::Box => &Kind::Simple, + Inner::Polygon => &Kind::Simple, + Inner::Line => &Kind::Simple, + Inner::LineArray => &Kind::Array(Type(Inner::Line)), + Inner::Cidr => &Kind::Simple, + Inner::CidrArray => &Kind::Array(Type(Inner::Cidr)), + Inner::Float4 => &Kind::Simple, + Inner::Float8 => &Kind::Simple, + Inner::Unknown => &Kind::Simple, + Inner::Circle => &Kind::Simple, + Inner::CircleArray => &Kind::Array(Type(Inner::Circle)), + Inner::Macaddr8 => &Kind::Simple, + Inner::Macaddr8Array => &Kind::Array(Type(Inner::Macaddr8)), + Inner::Money => &Kind::Simple, + Inner::MoneyArray => &Kind::Array(Type(Inner::Money)), + Inner::Macaddr => &Kind::Simple, + Inner::Inet => &Kind::Simple, + Inner::BoolArray => &Kind::Array(Type(Inner::Bool)), + Inner::ByteaArray => &Kind::Array(Type(Inner::Bytea)), + Inner::CharArray => &Kind::Array(Type(Inner::Char)), + Inner::NameArray => &Kind::Array(Type(Inner::Name)), + Inner::Int2Array => &Kind::Array(Type(Inner::Int2)), + Inner::Int2VectorArray => &Kind::Array(Type(Inner::Int2Vector)), + Inner::Int4Array => &Kind::Array(Type(Inner::Int4)), + Inner::RegprocArray => &Kind::Array(Type(Inner::Regproc)), + Inner::TextArray => &Kind::Array(Type(Inner::Text)), + Inner::TidArray => &Kind::Array(Type(Inner::Tid)), + Inner::XidArray => &Kind::Array(Type(Inner::Xid)), + Inner::CidArray => &Kind::Array(Type(Inner::Cid)), + Inner::OidVectorArray => &Kind::Array(Type(Inner::OidVector)), + Inner::BpcharArray => &Kind::Array(Type(Inner::Bpchar)), + Inner::VarcharArray => &Kind::Array(Type(Inner::Varchar)), + Inner::Int8Array => &Kind::Array(Type(Inner::Int8)), + Inner::PointArray => &Kind::Array(Type(Inner::Point)), + Inner::LsegArray => &Kind::Array(Type(Inner::Lseg)), + Inner::PathArray => &Kind::Array(Type(Inner::Path)), + Inner::BoxArray => &Kind::Array(Type(Inner::Box)), + Inner::Float4Array => &Kind::Array(Type(Inner::Float4)), + Inner::Float8Array => &Kind::Array(Type(Inner::Float8)), + Inner::PolygonArray => &Kind::Array(Type(Inner::Polygon)), + Inner::OidArray => &Kind::Array(Type(Inner::Oid)), + Inner::Aclitem => &Kind::Simple, + Inner::AclitemArray => &Kind::Array(Type(Inner::Aclitem)), + Inner::MacaddrArray => &Kind::Array(Type(Inner::Macaddr)), + Inner::InetArray => &Kind::Array(Type(Inner::Inet)), + Inner::Bpchar => &Kind::Simple, + Inner::Varchar => &Kind::Simple, + Inner::Date => &Kind::Simple, + Inner::Time => &Kind::Simple, + Inner::Timestamp => &Kind::Simple, + Inner::TimestampArray => &Kind::Array(Type(Inner::Timestamp)), + Inner::DateArray => &Kind::Array(Type(Inner::Date)), + Inner::TimeArray => &Kind::Array(Type(Inner::Time)), + Inner::Timestamptz => &Kind::Simple, + Inner::TimestamptzArray => &Kind::Array(Type(Inner::Timestamptz)), + Inner::Interval => &Kind::Simple, + Inner::IntervalArray => 
&Kind::Array(Type(Inner::Interval)), + Inner::NumericArray => &Kind::Array(Type(Inner::Numeric)), + Inner::CstringArray => &Kind::Array(Type(Inner::Cstring)), + Inner::Timetz => &Kind::Simple, + Inner::TimetzArray => &Kind::Array(Type(Inner::Timetz)), + Inner::Bit => &Kind::Simple, + Inner::BitArray => &Kind::Array(Type(Inner::Bit)), + Inner::Varbit => &Kind::Simple, + Inner::VarbitArray => &Kind::Array(Type(Inner::Varbit)), + Inner::Numeric => &Kind::Simple, + Inner::Refcursor => &Kind::Simple, + Inner::RefcursorArray => &Kind::Array(Type(Inner::Refcursor)), + Inner::Regprocedure => &Kind::Simple, + Inner::Regoper => &Kind::Simple, + Inner::Regoperator => &Kind::Simple, + Inner::Regclass => &Kind::Simple, + Inner::Regtype => &Kind::Simple, + Inner::RegprocedureArray => &Kind::Array(Type(Inner::Regprocedure)), + Inner::RegoperArray => &Kind::Array(Type(Inner::Regoper)), + Inner::RegoperatorArray => &Kind::Array(Type(Inner::Regoperator)), + Inner::RegclassArray => &Kind::Array(Type(Inner::Regclass)), + Inner::RegtypeArray => &Kind::Array(Type(Inner::Regtype)), + Inner::Record => &Kind::Pseudo, + Inner::Cstring => &Kind::Pseudo, + Inner::Any => &Kind::Pseudo, + Inner::Anyarray => &Kind::Pseudo, + Inner::Void => &Kind::Pseudo, + Inner::Trigger => &Kind::Pseudo, + Inner::LanguageHandler => &Kind::Pseudo, + Inner::Internal => &Kind::Pseudo, + Inner::Anyelement => &Kind::Pseudo, + Inner::RecordArray => &Kind::Pseudo, + Inner::Anynonarray => &Kind::Pseudo, + Inner::TxidSnapshotArray => &Kind::Array(Type(Inner::TxidSnapshot)), + Inner::Uuid => &Kind::Simple, + Inner::UuidArray => &Kind::Array(Type(Inner::Uuid)), + Inner::TxidSnapshot => &Kind::Simple, + Inner::FdwHandler => &Kind::Pseudo, + Inner::PgLsn => &Kind::Simple, + Inner::PgLsnArray => &Kind::Array(Type(Inner::PgLsn)), + Inner::TsmHandler => &Kind::Pseudo, + Inner::PgNdistinct => &Kind::Simple, + Inner::PgDependencies => &Kind::Simple, + Inner::Anyenum => &Kind::Pseudo, + Inner::TsVector => &Kind::Simple, + Inner::Tsquery => &Kind::Simple, + Inner::GtsVector => &Kind::Simple, + Inner::TsVectorArray => &Kind::Array(Type(Inner::TsVector)), + Inner::GtsVectorArray => &Kind::Array(Type(Inner::GtsVector)), + Inner::TsqueryArray => &Kind::Array(Type(Inner::Tsquery)), + Inner::Regconfig => &Kind::Simple, + Inner::RegconfigArray => &Kind::Array(Type(Inner::Regconfig)), + Inner::Regdictionary => &Kind::Simple, + Inner::RegdictionaryArray => &Kind::Array(Type(Inner::Regdictionary)), + Inner::Jsonb => &Kind::Simple, + Inner::JsonbArray => &Kind::Array(Type(Inner::Jsonb)), + Inner::AnyRange => &Kind::Pseudo, + Inner::EventTrigger => &Kind::Pseudo, + Inner::Int4Range => &Kind::Range(Type(Inner::Int4)), + Inner::Int4RangeArray => &Kind::Array(Type(Inner::Int4Range)), + Inner::NumRange => &Kind::Range(Type(Inner::Numeric)), + Inner::NumRangeArray => &Kind::Array(Type(Inner::NumRange)), + Inner::TsRange => &Kind::Range(Type(Inner::Timestamp)), + Inner::TsRangeArray => &Kind::Array(Type(Inner::TsRange)), + Inner::TstzRange => &Kind::Range(Type(Inner::Timestamptz)), + Inner::TstzRangeArray => &Kind::Array(Type(Inner::TstzRange)), + Inner::DateRange => &Kind::Range(Type(Inner::Date)), + Inner::DateRangeArray => &Kind::Array(Type(Inner::DateRange)), + Inner::Int8Range => &Kind::Range(Type(Inner::Int8)), + Inner::Int8RangeArray => &Kind::Array(Type(Inner::Int8Range)), + Inner::Jsonpath => &Kind::Simple, + Inner::JsonpathArray => &Kind::Array(Type(Inner::Jsonpath)), + Inner::Regnamespace => &Kind::Simple, + Inner::RegnamespaceArray => 
&Kind::Array(Type(Inner::Regnamespace)), + Inner::Regrole => &Kind::Simple, + Inner::RegroleArray => &Kind::Array(Type(Inner::Regrole)), + Inner::Regcollation => &Kind::Simple, + Inner::RegcollationArray => &Kind::Array(Type(Inner::Regcollation)), + Inner::Int4multiRange => &Kind::Multirange(Type(Inner::Int4)), + Inner::NummultiRange => &Kind::Multirange(Type(Inner::Numeric)), + Inner::TsmultiRange => &Kind::Multirange(Type(Inner::Timestamp)), + Inner::TstzmultiRange => &Kind::Multirange(Type(Inner::Timestamptz)), + Inner::DatemultiRange => &Kind::Multirange(Type(Inner::Date)), + Inner::Int8multiRange => &Kind::Multirange(Type(Inner::Int8)), + Inner::AnymultiRange => &Kind::Pseudo, + Inner::AnycompatiblemultiRange => &Kind::Pseudo, + Inner::PgBrinBloomSummary => &Kind::Simple, + Inner::PgBrinMinmaxMultiSummary => &Kind::Simple, + Inner::PgMcvList => &Kind::Simple, + Inner::PgSnapshot => &Kind::Simple, + Inner::PgSnapshotArray => &Kind::Array(Type(Inner::PgSnapshot)), + Inner::Xid8 => &Kind::Simple, + Inner::Anycompatible => &Kind::Pseudo, + Inner::Anycompatiblearray => &Kind::Pseudo, + Inner::Anycompatiblenonarray => &Kind::Pseudo, + Inner::AnycompatibleRange => &Kind::Pseudo, + Inner::Int4multiRangeArray => &Kind::Array(Type(Inner::Int4multiRange)), + Inner::NummultiRangeArray => &Kind::Array(Type(Inner::NummultiRange)), + Inner::TsmultiRangeArray => &Kind::Array(Type(Inner::TsmultiRange)), + Inner::TstzmultiRangeArray => &Kind::Array(Type(Inner::TstzmultiRange)), + Inner::DatemultiRangeArray => &Kind::Array(Type(Inner::DatemultiRange)), + Inner::Int8multiRangeArray => &Kind::Array(Type(Inner::Int8multiRange)), + Inner::Other(ref u) => &u.kind, + } + } + + pub fn name(&self) -> &str { + match *self { + Inner::Bool => "bool", + Inner::Bytea => "bytea", + Inner::Char => "char", + Inner::Name => "name", + Inner::Int8 => "int8", + Inner::Int2 => "int2", + Inner::Int2Vector => "int2vector", + Inner::Int4 => "int4", + Inner::Regproc => "regproc", + Inner::Text => "text", + Inner::Oid => "oid", + Inner::Tid => "tid", + Inner::Xid => "xid", + Inner::Cid => "cid", + Inner::OidVector => "oidvector", + Inner::PgDdlCommand => "pg_ddl_command", + Inner::Json => "json", + Inner::Xml => "xml", + Inner::XmlArray => "_xml", + Inner::PgNodeTree => "pg_node_tree", + Inner::JsonArray => "_json", + Inner::TableAmHandler => "table_am_handler", + Inner::Xid8Array => "_xid8", + Inner::IndexAmHandler => "index_am_handler", + Inner::Point => "point", + Inner::Lseg => "lseg", + Inner::Path => "path", + Inner::Box => "box", + Inner::Polygon => "polygon", + Inner::Line => "line", + Inner::LineArray => "_line", + Inner::Cidr => "cidr", + Inner::CidrArray => "_cidr", + Inner::Float4 => "float4", + Inner::Float8 => "float8", + Inner::Unknown => "unknown", + Inner::Circle => "circle", + Inner::CircleArray => "_circle", + Inner::Macaddr8 => "macaddr8", + Inner::Macaddr8Array => "_macaddr8", + Inner::Money => "money", + Inner::MoneyArray => "_money", + Inner::Macaddr => "macaddr", + Inner::Inet => "inet", + Inner::BoolArray => "_bool", + Inner::ByteaArray => "_bytea", + Inner::CharArray => "_char", + Inner::NameArray => "_name", + Inner::Int2Array => "_int2", + Inner::Int2VectorArray => "_int2vector", + Inner::Int4Array => "_int4", + Inner::RegprocArray => "_regproc", + Inner::TextArray => "_text", + Inner::TidArray => "_tid", + Inner::XidArray => "_xid", + Inner::CidArray => "_cid", + Inner::OidVectorArray => "_oidvector", + Inner::BpcharArray => "_bpchar", + Inner::VarcharArray => "_varchar", + Inner::Int8Array => 
"_int8", + Inner::PointArray => "_point", + Inner::LsegArray => "_lseg", + Inner::PathArray => "_path", + Inner::BoxArray => "_box", + Inner::Float4Array => "_float4", + Inner::Float8Array => "_float8", + Inner::PolygonArray => "_polygon", + Inner::OidArray => "_oid", + Inner::Aclitem => "aclitem", + Inner::AclitemArray => "_aclitem", + Inner::MacaddrArray => "_macaddr", + Inner::InetArray => "_inet", + Inner::Bpchar => "bpchar", + Inner::Varchar => "varchar", + Inner::Date => "date", + Inner::Time => "time", + Inner::Timestamp => "timestamp", + Inner::TimestampArray => "_timestamp", + Inner::DateArray => "_date", + Inner::TimeArray => "_time", + Inner::Timestamptz => "timestamptz", + Inner::TimestamptzArray => "_timestamptz", + Inner::Interval => "interval", + Inner::IntervalArray => "_interval", + Inner::NumericArray => "_numeric", + Inner::CstringArray => "_cstring", + Inner::Timetz => "timetz", + Inner::TimetzArray => "_timetz", + Inner::Bit => "bit", + Inner::BitArray => "_bit", + Inner::Varbit => "varbit", + Inner::VarbitArray => "_varbit", + Inner::Numeric => "numeric", + Inner::Refcursor => "refcursor", + Inner::RefcursorArray => "_refcursor", + Inner::Regprocedure => "regprocedure", + Inner::Regoper => "regoper", + Inner::Regoperator => "regoperator", + Inner::Regclass => "regclass", + Inner::Regtype => "regtype", + Inner::RegprocedureArray => "_regprocedure", + Inner::RegoperArray => "_regoper", + Inner::RegoperatorArray => "_regoperator", + Inner::RegclassArray => "_regclass", + Inner::RegtypeArray => "_regtype", + Inner::Record => "record", + Inner::Cstring => "cstring", + Inner::Any => "any", + Inner::Anyarray => "anyarray", + Inner::Void => "void", + Inner::Trigger => "trigger", + Inner::LanguageHandler => "language_handler", + Inner::Internal => "internal", + Inner::Anyelement => "anyelement", + Inner::RecordArray => "_record", + Inner::Anynonarray => "anynonarray", + Inner::TxidSnapshotArray => "_txid_snapshot", + Inner::Uuid => "uuid", + Inner::UuidArray => "_uuid", + Inner::TxidSnapshot => "txid_snapshot", + Inner::FdwHandler => "fdw_handler", + Inner::PgLsn => "pg_lsn", + Inner::PgLsnArray => "_pg_lsn", + Inner::TsmHandler => "tsm_handler", + Inner::PgNdistinct => "pg_ndistinct", + Inner::PgDependencies => "pg_dependencies", + Inner::Anyenum => "anyenum", + Inner::TsVector => "tsvector", + Inner::Tsquery => "tsquery", + Inner::GtsVector => "gtsvector", + Inner::TsVectorArray => "_tsvector", + Inner::GtsVectorArray => "_gtsvector", + Inner::TsqueryArray => "_tsquery", + Inner::Regconfig => "regconfig", + Inner::RegconfigArray => "_regconfig", + Inner::Regdictionary => "regdictionary", + Inner::RegdictionaryArray => "_regdictionary", + Inner::Jsonb => "jsonb", + Inner::JsonbArray => "_jsonb", + Inner::AnyRange => "anyrange", + Inner::EventTrigger => "event_trigger", + Inner::Int4Range => "int4range", + Inner::Int4RangeArray => "_int4range", + Inner::NumRange => "numrange", + Inner::NumRangeArray => "_numrange", + Inner::TsRange => "tsrange", + Inner::TsRangeArray => "_tsrange", + Inner::TstzRange => "tstzrange", + Inner::TstzRangeArray => "_tstzrange", + Inner::DateRange => "daterange", + Inner::DateRangeArray => "_daterange", + Inner::Int8Range => "int8range", + Inner::Int8RangeArray => "_int8range", + Inner::Jsonpath => "jsonpath", + Inner::JsonpathArray => "_jsonpath", + Inner::Regnamespace => "regnamespace", + Inner::RegnamespaceArray => "_regnamespace", + Inner::Regrole => "regrole", + Inner::RegroleArray => "_regrole", + Inner::Regcollation => "regcollation", + 
Inner::RegcollationArray => "_regcollation", + Inner::Int4multiRange => "int4multirange", + Inner::NummultiRange => "nummultirange", + Inner::TsmultiRange => "tsmultirange", + Inner::TstzmultiRange => "tstzmultirange", + Inner::DatemultiRange => "datemultirange", + Inner::Int8multiRange => "int8multirange", + Inner::AnymultiRange => "anymultirange", + Inner::AnycompatiblemultiRange => "anycompatiblemultirange", + Inner::PgBrinBloomSummary => "pg_brin_bloom_summary", + Inner::PgBrinMinmaxMultiSummary => "pg_brin_minmax_multi_summary", + Inner::PgMcvList => "pg_mcv_list", + Inner::PgSnapshot => "pg_snapshot", + Inner::PgSnapshotArray => "_pg_snapshot", + Inner::Xid8 => "xid8", + Inner::Anycompatible => "anycompatible", + Inner::Anycompatiblearray => "anycompatiblearray", + Inner::Anycompatiblenonarray => "anycompatiblenonarray", + Inner::AnycompatibleRange => "anycompatiblerange", + Inner::Int4multiRangeArray => "_int4multirange", + Inner::NummultiRangeArray => "_nummultirange", + Inner::TsmultiRangeArray => "_tsmultirange", + Inner::TstzmultiRangeArray => "_tstzmultirange", + Inner::DatemultiRangeArray => "_datemultirange", + Inner::Int8multiRangeArray => "_int8multirange", + Inner::Other(ref u) => &u.name, + } + } +} +impl Type { + /// BOOL - boolean, 'true'/'false' + pub const BOOL: Type = Type(Inner::Bool); + + /// BYTEA - variable-length string, binary values escaped + pub const BYTEA: Type = Type(Inner::Bytea); + + /// CHAR - single character + pub const CHAR: Type = Type(Inner::Char); + + /// NAME - 63-byte type for storing system identifiers + pub const NAME: Type = Type(Inner::Name); + + /// INT8 - ~18 digit integer, 8-byte storage + pub const INT8: Type = Type(Inner::Int8); + + /// INT2 - -32 thousand to 32 thousand, 2-byte storage + pub const INT2: Type = Type(Inner::Int2); + + /// INT2VECTOR - array of int2, used in system tables + pub const INT2_VECTOR: Type = Type(Inner::Int2Vector); + + /// INT4 - -2 billion to 2 billion integer, 4-byte storage + pub const INT4: Type = Type(Inner::Int4); + + /// REGPROC - registered procedure + pub const REGPROC: Type = Type(Inner::Regproc); + + /// TEXT - variable-length string, no limit specified + pub const TEXT: Type = Type(Inner::Text); + + /// OID - object identifier(oid), maximum 4 billion + pub const OID: Type = Type(Inner::Oid); + + /// TID - (block, offset), physical location of tuple + pub const TID: Type = Type(Inner::Tid); + + /// XID - transaction id + pub const XID: Type = Type(Inner::Xid); + + /// CID - command identifier type, sequence in transaction id + pub const CID: Type = Type(Inner::Cid); + + /// OIDVECTOR - array of oids, used in system tables + pub const OID_VECTOR: Type = Type(Inner::OidVector); + + /// PG_DDL_COMMAND - internal type for passing CollectedCommand + pub const PG_DDL_COMMAND: Type = Type(Inner::PgDdlCommand); + + /// JSON - JSON stored as text + pub const JSON: Type = Type(Inner::Json); + + /// XML - XML content + pub const XML: Type = Type(Inner::Xml); + + /// XML[] + pub const XML_ARRAY: Type = Type(Inner::XmlArray); + + /// PG_NODE_TREE - string representing an internal node tree + pub const PG_NODE_TREE: Type = Type(Inner::PgNodeTree); + + /// JSON[] + pub const JSON_ARRAY: Type = Type(Inner::JsonArray); + + /// TABLE_AM_HANDLER + pub const TABLE_AM_HANDLER: Type = Type(Inner::TableAmHandler); + + /// XID8[] + pub const XID8_ARRAY: Type = Type(Inner::Xid8Array); + + /// INDEX_AM_HANDLER - pseudo-type for the result of an index AM handler function + pub const INDEX_AM_HANDLER: Type = 
Type(Inner::IndexAmHandler); + + /// POINT - geometric point '(x, y)' + pub const POINT: Type = Type(Inner::Point); + + /// LSEG - geometric line segment '(pt1,pt2)' + pub const LSEG: Type = Type(Inner::Lseg); + + /// PATH - geometric path '(pt1,...)' + pub const PATH: Type = Type(Inner::Path); + + /// BOX - geometric box '(lower left,upper right)' + pub const BOX: Type = Type(Inner::Box); + + /// POLYGON - geometric polygon '(pt1,...)' + pub const POLYGON: Type = Type(Inner::Polygon); + + /// LINE - geometric line + pub const LINE: Type = Type(Inner::Line); + + /// LINE[] + pub const LINE_ARRAY: Type = Type(Inner::LineArray); + + /// CIDR - network IP address/netmask, network address + pub const CIDR: Type = Type(Inner::Cidr); + + /// CIDR[] + pub const CIDR_ARRAY: Type = Type(Inner::CidrArray); + + /// FLOAT4 - single-precision floating point number, 4-byte storage + pub const FLOAT4: Type = Type(Inner::Float4); + + /// FLOAT8 - double-precision floating point number, 8-byte storage + pub const FLOAT8: Type = Type(Inner::Float8); + + /// UNKNOWN - pseudo-type representing an undetermined type + pub const UNKNOWN: Type = Type(Inner::Unknown); + + /// CIRCLE - geometric circle '(center,radius)' + pub const CIRCLE: Type = Type(Inner::Circle); + + /// CIRCLE[] + pub const CIRCLE_ARRAY: Type = Type(Inner::CircleArray); + + /// MACADDR8 - XX:XX:XX:XX:XX:XX:XX:XX, MAC address + pub const MACADDR8: Type = Type(Inner::Macaddr8); + + /// MACADDR8[] + pub const MACADDR8_ARRAY: Type = Type(Inner::Macaddr8Array); + + /// MONEY - monetary amounts, $d,ddd.cc + pub const MONEY: Type = Type(Inner::Money); + + /// MONEY[] + pub const MONEY_ARRAY: Type = Type(Inner::MoneyArray); + + /// MACADDR - XX:XX:XX:XX:XX:XX, MAC address + pub const MACADDR: Type = Type(Inner::Macaddr); + + /// INET - IP address/netmask, host address, netmask optional + pub const INET: Type = Type(Inner::Inet); + + /// BOOL[] + pub const BOOL_ARRAY: Type = Type(Inner::BoolArray); + + /// BYTEA[] + pub const BYTEA_ARRAY: Type = Type(Inner::ByteaArray); + + /// CHAR[] + pub const CHAR_ARRAY: Type = Type(Inner::CharArray); + + /// NAME[] + pub const NAME_ARRAY: Type = Type(Inner::NameArray); + + /// INT2[] + pub const INT2_ARRAY: Type = Type(Inner::Int2Array); + + /// INT2VECTOR[] + pub const INT2_VECTOR_ARRAY: Type = Type(Inner::Int2VectorArray); + + /// INT4[] + pub const INT4_ARRAY: Type = Type(Inner::Int4Array); + + /// REGPROC[] + pub const REGPROC_ARRAY: Type = Type(Inner::RegprocArray); + + /// TEXT[] + pub const TEXT_ARRAY: Type = Type(Inner::TextArray); + + /// TID[] + pub const TID_ARRAY: Type = Type(Inner::TidArray); + + /// XID[] + pub const XID_ARRAY: Type = Type(Inner::XidArray); + + /// CID[] + pub const CID_ARRAY: Type = Type(Inner::CidArray); + + /// OIDVECTOR[] + pub const OID_VECTOR_ARRAY: Type = Type(Inner::OidVectorArray); + + /// BPCHAR[] + pub const BPCHAR_ARRAY: Type = Type(Inner::BpcharArray); + + /// VARCHAR[] + pub const VARCHAR_ARRAY: Type = Type(Inner::VarcharArray); + + /// INT8[] + pub const INT8_ARRAY: Type = Type(Inner::Int8Array); + + /// POINT[] + pub const POINT_ARRAY: Type = Type(Inner::PointArray); + + /// LSEG[] + pub const LSEG_ARRAY: Type = Type(Inner::LsegArray); + + /// PATH[] + pub const PATH_ARRAY: Type = Type(Inner::PathArray); + + /// BOX[] + pub const BOX_ARRAY: Type = Type(Inner::BoxArray); + + /// FLOAT4[] + pub const FLOAT4_ARRAY: Type = Type(Inner::Float4Array); + + /// FLOAT8[] + pub const FLOAT8_ARRAY: Type = Type(Inner::Float8Array); + + /// POLYGON[] + pub const POLYGON_ARRAY: Type 
= Type(Inner::PolygonArray); + + /// OID[] + pub const OID_ARRAY: Type = Type(Inner::OidArray); + + /// ACLITEM - access control list + pub const ACLITEM: Type = Type(Inner::Aclitem); + + /// ACLITEM[] + pub const ACLITEM_ARRAY: Type = Type(Inner::AclitemArray); + + /// MACADDR[] + pub const MACADDR_ARRAY: Type = Type(Inner::MacaddrArray); + + /// INET[] + pub const INET_ARRAY: Type = Type(Inner::InetArray); + + /// BPCHAR - char(length), blank-padded string, fixed storage length + pub const BPCHAR: Type = Type(Inner::Bpchar); + + /// VARCHAR - varchar(length), non-blank-padded string, variable storage length + pub const VARCHAR: Type = Type(Inner::Varchar); + + /// DATE - date + pub const DATE: Type = Type(Inner::Date); + + /// TIME - time of day + pub const TIME: Type = Type(Inner::Time); + + /// TIMESTAMP - date and time + pub const TIMESTAMP: Type = Type(Inner::Timestamp); + + /// TIMESTAMP[] + pub const TIMESTAMP_ARRAY: Type = Type(Inner::TimestampArray); + + /// DATE[] + pub const DATE_ARRAY: Type = Type(Inner::DateArray); + + /// TIME[] + pub const TIME_ARRAY: Type = Type(Inner::TimeArray); + + /// TIMESTAMPTZ - date and time with time zone + pub const TIMESTAMPTZ: Type = Type(Inner::Timestamptz); + + /// TIMESTAMPTZ[] + pub const TIMESTAMPTZ_ARRAY: Type = Type(Inner::TimestamptzArray); + + /// INTERVAL - @ <number> <units>, time interval + pub const INTERVAL: Type = Type(Inner::Interval); + + /// INTERVAL[] + pub const INTERVAL_ARRAY: Type = Type(Inner::IntervalArray); + + /// NUMERIC[] + pub const NUMERIC_ARRAY: Type = Type(Inner::NumericArray); + + /// CSTRING[] + pub const CSTRING_ARRAY: Type = Type(Inner::CstringArray); + + /// TIMETZ - time of day with time zone + pub const TIMETZ: Type = Type(Inner::Timetz); + + /// TIMETZ[] + pub const TIMETZ_ARRAY: Type = Type(Inner::TimetzArray); + + /// BIT - fixed-length bit string + pub const BIT: Type = Type(Inner::Bit); + + /// BIT[] + pub const BIT_ARRAY: Type = Type(Inner::BitArray); + + /// VARBIT - variable-length bit string + pub const VARBIT: Type = Type(Inner::Varbit); + + /// VARBIT[] + pub const VARBIT_ARRAY: Type = Type(Inner::VarbitArray); + + /// NUMERIC - numeric(precision, decimal), arbitrary precision number + pub const NUMERIC: Type = Type(Inner::Numeric); + + /// REFCURSOR - reference to cursor (portal name) + pub const REFCURSOR: Type = Type(Inner::Refcursor); + + /// REFCURSOR[] + pub const REFCURSOR_ARRAY: Type = Type(Inner::RefcursorArray); + + /// REGPROCEDURE - registered procedure (with args) + pub const REGPROCEDURE: Type = Type(Inner::Regprocedure); + + /// REGOPER - registered operator + pub const REGOPER: Type = Type(Inner::Regoper); + + /// REGOPERATOR - registered operator (with args) + pub const REGOPERATOR: Type = Type(Inner::Regoperator); + + /// REGCLASS - registered class + pub const REGCLASS: Type = Type(Inner::Regclass); + + /// REGTYPE - registered type + pub const REGTYPE: Type = Type(Inner::Regtype); + + /// REGPROCEDURE[] + pub const REGPROCEDURE_ARRAY: Type = Type(Inner::RegprocedureArray); + + /// REGOPER[] + pub const REGOPER_ARRAY: Type = Type(Inner::RegoperArray); + + /// REGOPERATOR[] + pub const REGOPERATOR_ARRAY: Type = Type(Inner::RegoperatorArray); + + /// REGCLASS[] + pub const REGCLASS_ARRAY: Type = Type(Inner::RegclassArray); + + /// REGTYPE[] + pub const REGTYPE_ARRAY: Type = Type(Inner::RegtypeArray); + + /// RECORD - pseudo-type representing any composite type + pub const RECORD: Type = Type(Inner::Record); + + /// CSTRING - C-style string + pub const CSTRING: Type = 
Type(Inner::Cstring); + + /// ANY - pseudo-type representing any type + pub const ANY: Type = Type(Inner::Any); + + /// ANYARRAY - pseudo-type representing a polymorphic array type + pub const ANYARRAY: Type = Type(Inner::Anyarray); + + /// VOID - pseudo-type for the result of a function with no real result + pub const VOID: Type = Type(Inner::Void); + + /// TRIGGER - pseudo-type for the result of a trigger function + pub const TRIGGER: Type = Type(Inner::Trigger); + + /// LANGUAGE_HANDLER - pseudo-type for the result of a language handler function + pub const LANGUAGE_HANDLER: Type = Type(Inner::LanguageHandler); + + /// INTERNAL - pseudo-type representing an internal data structure + pub const INTERNAL: Type = Type(Inner::Internal); + + /// ANYELEMENT - pseudo-type representing a polymorphic base type + pub const ANYELEMENT: Type = Type(Inner::Anyelement); + + /// RECORD[] + pub const RECORD_ARRAY: Type = Type(Inner::RecordArray); + + /// ANYNONARRAY - pseudo-type representing a polymorphic base type that is not an array + pub const ANYNONARRAY: Type = Type(Inner::Anynonarray); + + /// TXID_SNAPSHOT[] + pub const TXID_SNAPSHOT_ARRAY: Type = Type(Inner::TxidSnapshotArray); + + /// UUID - UUID datatype + pub const UUID: Type = Type(Inner::Uuid); + + /// UUID[] + pub const UUID_ARRAY: Type = Type(Inner::UuidArray); + + /// TXID_SNAPSHOT - txid snapshot + pub const TXID_SNAPSHOT: Type = Type(Inner::TxidSnapshot); + + /// FDW_HANDLER - pseudo-type for the result of an FDW handler function + pub const FDW_HANDLER: Type = Type(Inner::FdwHandler); + + /// PG_LSN - PostgreSQL LSN datatype + pub const PG_LSN: Type = Type(Inner::PgLsn); + + /// PG_LSN[] + pub const PG_LSN_ARRAY: Type = Type(Inner::PgLsnArray); + + /// TSM_HANDLER - pseudo-type for the result of a tablesample method function + pub const TSM_HANDLER: Type = Type(Inner::TsmHandler); + + /// PG_NDISTINCT - multivariate ndistinct coefficients + pub const PG_NDISTINCT: Type = Type(Inner::PgNdistinct); + + /// PG_DEPENDENCIES - multivariate dependencies + pub const PG_DEPENDENCIES: Type = Type(Inner::PgDependencies); + + /// ANYENUM - pseudo-type representing a polymorphic base type that is an enum + pub const ANYENUM: Type = Type(Inner::Anyenum); + + /// TSVECTOR - text representation for text search + pub const TS_VECTOR: Type = Type(Inner::TsVector); + + /// TSQUERY - query representation for text search + pub const TSQUERY: Type = Type(Inner::Tsquery); + + /// GTSVECTOR - GiST index internal text representation for text search + pub const GTS_VECTOR: Type = Type(Inner::GtsVector); + + /// TSVECTOR[] + pub const TS_VECTOR_ARRAY: Type = Type(Inner::TsVectorArray); + + /// GTSVECTOR[] + pub const GTS_VECTOR_ARRAY: Type = Type(Inner::GtsVectorArray); + + /// TSQUERY[] + pub const TSQUERY_ARRAY: Type = Type(Inner::TsqueryArray); + + /// REGCONFIG - registered text search configuration + pub const REGCONFIG: Type = Type(Inner::Regconfig); + + /// REGCONFIG[] + pub const REGCONFIG_ARRAY: Type = Type(Inner::RegconfigArray); + + /// REGDICTIONARY - registered text search dictionary + pub const REGDICTIONARY: Type = Type(Inner::Regdictionary); + + /// REGDICTIONARY[] + pub const REGDICTIONARY_ARRAY: Type = Type(Inner::RegdictionaryArray); + + /// JSONB - Binary JSON + pub const JSONB: Type = Type(Inner::Jsonb); + + /// JSONB[] + pub const JSONB_ARRAY: Type = Type(Inner::JsonbArray); + + /// ANYRANGE - pseudo-type representing a range over a polymorphic base type + pub const ANY_RANGE: Type = Type(Inner::AnyRange); + + /// EVENT_TRIGGER - 
pseudo-type for the result of an event trigger function + pub const EVENT_TRIGGER: Type = Type(Inner::EventTrigger); + + /// INT4RANGE - range of integers + pub const INT4_RANGE: Type = Type(Inner::Int4Range); + + /// INT4RANGE[] + pub const INT4_RANGE_ARRAY: Type = Type(Inner::Int4RangeArray); + + /// NUMRANGE - range of numerics + pub const NUM_RANGE: Type = Type(Inner::NumRange); + + /// NUMRANGE[] + pub const NUM_RANGE_ARRAY: Type = Type(Inner::NumRangeArray); + + /// TSRANGE - range of timestamps without time zone + pub const TS_RANGE: Type = Type(Inner::TsRange); + + /// TSRANGE[] + pub const TS_RANGE_ARRAY: Type = Type(Inner::TsRangeArray); + + /// TSTZRANGE - range of timestamps with time zone + pub const TSTZ_RANGE: Type = Type(Inner::TstzRange); + + /// TSTZRANGE[] + pub const TSTZ_RANGE_ARRAY: Type = Type(Inner::TstzRangeArray); + + /// DATERANGE - range of dates + pub const DATE_RANGE: Type = Type(Inner::DateRange); + + /// DATERANGE[] + pub const DATE_RANGE_ARRAY: Type = Type(Inner::DateRangeArray); + + /// INT8RANGE - range of bigints + pub const INT8_RANGE: Type = Type(Inner::Int8Range); + + /// INT8RANGE[] + pub const INT8_RANGE_ARRAY: Type = Type(Inner::Int8RangeArray); + + /// JSONPATH - JSON path + pub const JSONPATH: Type = Type(Inner::Jsonpath); + + /// JSONPATH[] + pub const JSONPATH_ARRAY: Type = Type(Inner::JsonpathArray); + + /// REGNAMESPACE - registered namespace + pub const REGNAMESPACE: Type = Type(Inner::Regnamespace); + + /// REGNAMESPACE[] + pub const REGNAMESPACE_ARRAY: Type = Type(Inner::RegnamespaceArray); + + /// REGROLE - registered role + pub const REGROLE: Type = Type(Inner::Regrole); + + /// REGROLE[] + pub const REGROLE_ARRAY: Type = Type(Inner::RegroleArray); + + /// REGCOLLATION - registered collation + pub const REGCOLLATION: Type = Type(Inner::Regcollation); + + /// REGCOLLATION[] + pub const REGCOLLATION_ARRAY: Type = Type(Inner::RegcollationArray); + + /// INT4MULTIRANGE - multirange of integers + pub const INT4MULTI_RANGE: Type = Type(Inner::Int4multiRange); + + /// NUMMULTIRANGE - multirange of numerics + pub const NUMMULTI_RANGE: Type = Type(Inner::NummultiRange); + + /// TSMULTIRANGE - multirange of timestamps without time zone + pub const TSMULTI_RANGE: Type = Type(Inner::TsmultiRange); + + /// TSTZMULTIRANGE - multirange of timestamps with time zone + pub const TSTZMULTI_RANGE: Type = Type(Inner::TstzmultiRange); + + /// DATEMULTIRANGE - multirange of dates + pub const DATEMULTI_RANGE: Type = Type(Inner::DatemultiRange); + + /// INT8MULTIRANGE - multirange of bigints + pub const INT8MULTI_RANGE: Type = Type(Inner::Int8multiRange); + + /// ANYMULTIRANGE - pseudo-type representing a polymorphic base type that is a multirange + pub const ANYMULTI_RANGE: Type = Type(Inner::AnymultiRange); + + /// ANYCOMPATIBLEMULTIRANGE - pseudo-type representing a multirange over a polymorphic common type + pub const ANYCOMPATIBLEMULTI_RANGE: Type = Type(Inner::AnycompatiblemultiRange); + + /// PG_BRIN_BLOOM_SUMMARY - BRIN bloom summary + pub const PG_BRIN_BLOOM_SUMMARY: Type = Type(Inner::PgBrinBloomSummary); + + /// PG_BRIN_MINMAX_MULTI_SUMMARY - BRIN minmax-multi summary + pub const PG_BRIN_MINMAX_MULTI_SUMMARY: Type = Type(Inner::PgBrinMinmaxMultiSummary); + + /// PG_MCV_LIST - multivariate MCV list + pub const PG_MCV_LIST: Type = Type(Inner::PgMcvList); + + /// PG_SNAPSHOT - snapshot + pub const PG_SNAPSHOT: Type = Type(Inner::PgSnapshot); + + /// PG_SNAPSHOT[] + pub const PG_SNAPSHOT_ARRAY: Type = Type(Inner::PgSnapshotArray); + + /// XID8 - full 
transaction id + pub const XID8: Type = Type(Inner::Xid8); + + /// ANYCOMPATIBLE - pseudo-type representing a polymorphic common type + pub const ANYCOMPATIBLE: Type = Type(Inner::Anycompatible); + + /// ANYCOMPATIBLEARRAY - pseudo-type representing an array of polymorphic common type elements + pub const ANYCOMPATIBLEARRAY: Type = Type(Inner::Anycompatiblearray); + + /// ANYCOMPATIBLENONARRAY - pseudo-type representing a polymorphic common type that is not an array + pub const ANYCOMPATIBLENONARRAY: Type = Type(Inner::Anycompatiblenonarray); + + /// ANYCOMPATIBLERANGE - pseudo-type representing a range over a polymorphic common type + pub const ANYCOMPATIBLE_RANGE: Type = Type(Inner::AnycompatibleRange); + + /// INT4MULTIRANGE[] + pub const INT4MULTI_RANGE_ARRAY: Type = Type(Inner::Int4multiRangeArray); + + /// NUMMULTIRANGE[] + pub const NUMMULTI_RANGE_ARRAY: Type = Type(Inner::NummultiRangeArray); + + /// TSMULTIRANGE[] + pub const TSMULTI_RANGE_ARRAY: Type = Type(Inner::TsmultiRangeArray); + + /// TSTZMULTIRANGE[] + pub const TSTZMULTI_RANGE_ARRAY: Type = Type(Inner::TstzmultiRangeArray); + + /// DATEMULTIRANGE[] + pub const DATEMULTI_RANGE_ARRAY: Type = Type(Inner::DatemultiRangeArray); + + /// INT8MULTIRANGE[] + pub const INT8MULTI_RANGE_ARRAY: Type = Type(Inner::Int8multiRangeArray); +} diff --git a/libs/proxy/tokio-postgres2/Cargo.toml b/libs/proxy/tokio-postgres2/Cargo.toml new file mode 100644 index 0000000000..7130c1b726 --- /dev/null +++ b/libs/proxy/tokio-postgres2/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "tokio-postgres2" +version = "0.1.0" +edition = "2018" +license = "MIT/Apache-2.0" + +[dependencies] +async-trait.workspace = true +bytes.workspace = true +byteorder.workspace = true +fallible-iterator.workspace = true +futures-util = { workspace = true, features = ["sink"] } +log = "0.4" +parking_lot.workspace = true +percent-encoding = "2.0" +pin-project-lite.workspace = true +phf = "0.11" +postgres-protocol2 = { path = "../postgres-protocol2" } +postgres-types2 = { path = "../postgres-types2" } +tokio = { workspace = true, features = ["io-util", "time", "net"] } +tokio-util = { workspace = true, features = ["codec"] } diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs new file mode 100644 index 0000000000..cddbf16336 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs @@ -0,0 +1,40 @@ +use tokio::net::TcpStream; + +use crate::client::SocketConfig; +use crate::config::{Host, SslMode}; +use crate::tls::MakeTlsConnect; +use crate::{cancel_query_raw, connect_socket, Error}; +use std::io; + +pub(crate) async fn cancel_query( + config: Option, + ssl_mode: SslMode, + mut tls: T, + process_id: i32, + secret_key: i32, +) -> Result<(), Error> +where + T: MakeTlsConnect, +{ + let config = match config { + Some(config) => config, + None => { + return Err(Error::connect(io::Error::new( + io::ErrorKind::InvalidInput, + "unknown host", + ))) + } + }; + + let hostname = match &config.host { + Host::Tcp(host) => &**host, + }; + let tls = tls + .make_tls_connect(hostname) + .map_err(|e| Error::tls(e.into()))?; + + let socket = + connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?; + + cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await +} diff --git a/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs new file mode 100644 index 0000000000..8c08296435 --- /dev/null +++ 
b/libs/proxy/tokio-postgres2/src/cancel_query_raw.rs @@ -0,0 +1,29 @@ +use crate::config::SslMode; +use crate::tls::TlsConnect; +use crate::{connect_tls, Error}; +use bytes::BytesMut; +use postgres_protocol2::message::frontend; +use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; + +pub async fn cancel_query_raw( + stream: S, + mode: SslMode, + tls: T, + process_id: i32, + secret_key: i32, +) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, +{ + let mut stream = connect_tls::connect_tls(stream, mode, tls).await?; + + let mut buf = BytesMut::new(); + frontend::cancel_request(process_id, secret_key, &mut buf); + + stream.write_all(&buf).await.map_err(Error::io)?; + stream.flush().await.map_err(Error::io)?; + stream.shutdown().await.map_err(Error::io)?; + + Ok(()) +} diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs new file mode 100644 index 0000000000..b949bf358f --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -0,0 +1,62 @@ +use crate::config::SslMode; +use crate::tls::TlsConnect; + +use crate::{cancel_query, client::SocketConfig, tls::MakeTlsConnect}; +use crate::{cancel_query_raw, Error}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpStream; + +/// The capability to request cancellation of in-progress queries on a +/// connection. +#[derive(Clone)] +pub struct CancelToken { + pub(crate) socket_config: Option, + pub(crate) ssl_mode: SslMode, + pub(crate) process_id: i32, + pub(crate) secret_key: i32, +} + +impl CancelToken { + /// Attempts to cancel the in-progress query on the connection associated + /// with this `CancelToken`. + /// + /// The server provides no information about whether a cancellation attempt was successful or not. An error will + /// only be returned if the client was unable to connect to the database. + /// + /// Cancellation is inherently racy. There is no guarantee that the + /// cancellation request will reach the server before the query terminates + /// normally, or that the connection associated with this token is still + /// active. + /// + /// Requires the `runtime` Cargo feature (enabled by default). + pub async fn cancel_query(&self, tls: T) -> Result<(), Error> + where + T: MakeTlsConnect, + { + cancel_query::cancel_query( + self.socket_config.clone(), + self.ssl_mode, + tls, + self.process_id, + self.secret_key, + ) + .await + } + + /// Like `cancel_query`, but uses a stream which is already connected to the server rather than opening a new + /// connection itself. 
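+ ///
+ /// A minimal sketch of the intended call pattern (not part of this patch;
+ /// `client`, `tls`, and the target address are assumptions — `tls` stands in
+ /// for some `TlsConnect` implementation for `TcpStream`):
+ ///
+ /// ```ignore
+ /// // `client` is an already-connected Client; `tls` is an assumed TlsConnect impl.
+ /// let token = client.cancel_token();
+ /// let stream = tokio::net::TcpStream::connect(("localhost", 5432)).await?;
+ /// token.cancel_query_raw(stream, tls).await?;
+ /// ```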
+ pub async fn cancel_query_raw(&self, stream: S, tls: T) -> Result<(), Error> + where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, + { + cancel_query_raw::cancel_query_raw( + stream, + self.ssl_mode, + tls, + self.process_id, + self.secret_key, + ) + .await + } +} diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs new file mode 100644 index 0000000000..96200b71e7 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -0,0 +1,439 @@ +use crate::codec::{BackendMessages, FrontendMessage}; + +use crate::config::Host; +use crate::config::SslMode; +use crate::connection::{Request, RequestMessages}; + +use crate::query::RowStream; +use crate::simple_query::SimpleQueryStream; + +use crate::types::{Oid, ToSql, Type}; + +use crate::{ + prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row, + SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder, +}; +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{future, ready, TryStreamExt}; +use parking_lot::Mutex; +use postgres_protocol2::message::{backend::Message, frontend}; +use std::collections::HashMap; +use std::fmt; +use std::sync::Arc; +use std::task::{Context, Poll}; +use tokio::sync::mpsc; + +use std::time::Duration; + +pub struct Responses { + receiver: mpsc::Receiver, + cur: BackendMessages, +} + +impl Responses { + pub fn poll_next(&mut self, cx: &mut Context<'_>) -> Poll> { + loop { + match self.cur.next().map_err(Error::parse)? { + Some(Message::ErrorResponse(body)) => return Poll::Ready(Err(Error::db(body))), + Some(message) => return Poll::Ready(Ok(message)), + None => {} + } + + match ready!(self.receiver.poll_recv(cx)) { + Some(messages) => self.cur = messages, + None => return Poll::Ready(Err(Error::closed())), + } + } + } + + pub async fn next(&mut self) -> Result { + future::poll_fn(|cx| self.poll_next(cx)).await + } +} + +/// A cache of type info and prepared statements for fetching type info +/// (corresponding to the queries in the [prepare] module). +#[derive(Default)] +struct CachedTypeInfo { + /// A statement for basic information for a type from its + /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its + /// fallback). + typeinfo: Option, + /// A statement for getting information for a composite type from its OID. + /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY). + typeinfo_composite: Option, + /// A statement for getting information for a composite type from its OID. + /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or + /// its fallback). + typeinfo_enum: Option, + + /// Cache of types already looked up. + types: HashMap, +} + +pub struct InnerClient { + sender: mpsc::UnboundedSender, + cached_typeinfo: Mutex, + + /// A buffer to use when writing out postgres commands. 
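+ ///
+ /// Shared through `with_buf`, which hands out the locked buffer and clears
+ /// it after the closure returns, so the same allocation is reused for
+ /// successive commands rather than reallocated each time.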
+ buffer: Mutex, +} + +impl InnerClient { + pub fn send(&self, messages: RequestMessages) -> Result { + let (sender, receiver) = mpsc::channel(1); + let request = Request { messages, sender }; + self.sender.send(request).map_err(|_| Error::closed())?; + + Ok(Responses { + receiver, + cur: BackendMessages::empty(), + }) + } + + pub fn typeinfo(&self) -> Option { + self.cached_typeinfo.lock().typeinfo.clone() + } + + pub fn set_typeinfo(&self, statement: &Statement) { + self.cached_typeinfo.lock().typeinfo = Some(statement.clone()); + } + + pub fn typeinfo_composite(&self) -> Option { + self.cached_typeinfo.lock().typeinfo_composite.clone() + } + + pub fn set_typeinfo_composite(&self, statement: &Statement) { + self.cached_typeinfo.lock().typeinfo_composite = Some(statement.clone()); + } + + pub fn typeinfo_enum(&self) -> Option { + self.cached_typeinfo.lock().typeinfo_enum.clone() + } + + pub fn set_typeinfo_enum(&self, statement: &Statement) { + self.cached_typeinfo.lock().typeinfo_enum = Some(statement.clone()); + } + + pub fn type_(&self, oid: Oid) -> Option { + self.cached_typeinfo.lock().types.get(&oid).cloned() + } + + pub fn set_type(&self, oid: Oid, type_: &Type) { + self.cached_typeinfo.lock().types.insert(oid, type_.clone()); + } + + /// Call the given function with a buffer to be used when writing out + /// postgres commands. + pub fn with_buf(&self, f: F) -> R + where + F: FnOnce(&mut BytesMut) -> R, + { + let mut buffer = self.buffer.lock(); + let r = f(&mut buffer); + buffer.clear(); + r + } +} + +#[derive(Clone)] +pub(crate) struct SocketConfig { + pub host: Host, + pub port: u16, + pub connect_timeout: Option, + // pub keepalive: Option, +} + +/// An asynchronous PostgreSQL client. +/// +/// The client is one half of what is returned when a connection is established. Users interact with the database +/// through this client object. +pub struct Client { + inner: Arc, + + socket_config: Option, + ssl_mode: SslMode, + process_id: i32, + secret_key: i32, +} + +impl Client { + pub(crate) fn new( + sender: mpsc::UnboundedSender, + ssl_mode: SslMode, + process_id: i32, + secret_key: i32, + ) -> Client { + Client { + inner: Arc::new(InnerClient { + sender, + cached_typeinfo: Default::default(), + buffer: Default::default(), + }), + + socket_config: None, + ssl_mode, + process_id, + secret_key, + } + } + + /// Returns process_id. + pub fn get_process_id(&self) -> i32 { + self.process_id + } + + pub(crate) fn inner(&self) -> &Arc { + &self.inner + } + + pub(crate) fn set_socket_config(&mut self, socket_config: SocketConfig) { + self.socket_config = Some(socket_config); + } + + /// Creates a new prepared statement. + /// + /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), + /// which are set when executed. Prepared statements can only be used with the connection that created them. + pub async fn prepare(&self, query: &str) -> Result { + self.prepare_typed(query, &[]).await + } + + /// Like `prepare`, but allows the types of query parameters to be explicitly specified. + /// + /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be + /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`. 
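+ ///
+ /// A hypothetical sketch (it assumes `Type::INT4` and the usual `ToSql`
+ /// impl for `i32` are available in this crate, which is not shown here):
+ ///
+ /// ```ignore
+ /// // Only the first parameter's type is given; the second is inferred.
+ /// let stmt = client
+ ///     .prepare_typed("SELECT $1 + $2", &[Type::INT4])
+ ///     .await?;
+ /// let rows = client.query(&stmt, &[&1i32, &2i32]).await?;
+ /// ```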
+ pub async fn prepare_typed( + &self, + query: &str, + parameter_types: &[Type], + ) -> Result { + prepare::prepare(&self.inner, query, parameter_types).await + } + + /// Executes a statement, returning a vector of the resulting rows. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + pub async fn query( + &self, + statement: &T, + params: &[&(dyn ToSql + Sync)], + ) -> Result, Error> + where + T: ?Sized + ToStatement, + { + self.query_raw(statement, slice_iter(params)) + .await? + .try_collect() + .await + } + + /// The maximally flexible version of [`query`]. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + /// + /// [`query`]: #method.query + pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + where + T: ?Sized + ToStatement, + I: IntoIterator, + I::IntoIter: ExactSizeIterator, + { + let statement = statement.__convert().into_statement(self).await?; + query::query(&self.inner, statement, params).await + } + + /// Pass text directly to the Postgres backend to allow it to sort out typing itself and + /// to save a roundtrip + pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef, + I: IntoIterator>, + I::IntoIter: ExactSizeIterator, + { + query::query_txt(&self.inner, statement, params).await + } + + /// Executes a statement, returning the number of rows modified. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + pub async fn execute( + &self, + statement: &T, + params: &[&(dyn ToSql + Sync)], + ) -> Result + where + T: ?Sized + ToStatement, + { + self.execute_raw(statement, slice_iter(params)).await + } + + /// The maximally flexible version of [`execute`]. + /// + /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list + /// provided, 1-indexed. + /// + /// The `statement` argument can either be a `Statement`, or a raw query string. 
If the same statement will be + /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front + /// with the `prepare` method. + /// + /// # Panics + /// + /// Panics if the number of parameters provided does not match the number expected. + /// + /// [`execute`]: #method.execute + pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result + where + T: ?Sized + ToStatement, + I: IntoIterator, + I::IntoIter: ExactSizeIterator, + { + let statement = statement.__convert().into_statement(self).await?; + query::execute(self.inner(), statement, params).await + } + + /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows. + /// + /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that + /// point. The simple query protocol returns the values in rows as strings rather than in their binary encodings, + /// so the associated row type doesn't work with the `FromSql` trait. Rather than simply returning a list of the + /// rows, this method returns a list of an enum which indicates either the completion of one of the commands, + /// or a row of data. This preserves the framing between the separate statements in the request. + /// + /// # Warning + /// + /// Prepared statements should be use for any query which contains user-specified data, as they provided the + /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass + /// them to this method! + pub async fn simple_query(&self, query: &str) -> Result, Error> { + self.simple_query_raw(query).await?.try_collect().await + } + + pub(crate) async fn simple_query_raw(&self, query: &str) -> Result { + simple_query::simple_query(self.inner(), query).await + } + + /// Executes a sequence of SQL statements using the simple query protocol. + /// + /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that + /// point. This is intended for use when, for example, initializing a database schema. + /// + /// # Warning + /// + /// Prepared statements should be use for any query which contains user-specified data, as they provided the + /// functionality to safely embed that data in the request. Do not form statements via string concatenation and pass + /// them to this method! + pub async fn batch_execute(&self, query: &str) -> Result { + simple_query::batch_execute(self.inner(), query).await + } + + /// Begins a new database transaction. + /// + /// The transaction will roll back by default - use the `commit` method to commit it. + pub async fn transaction(&mut self) -> Result, Error> { + struct RollbackIfNotDone<'me> { + client: &'me Client, + done: bool, + } + + impl Drop for RollbackIfNotDone<'_> { + fn drop(&mut self) { + if self.done { + return; + } + + let buf = self.client.inner().with_buf(|buf| { + frontend::query("ROLLBACK", buf).unwrap(); + buf.split().freeze() + }); + let _ = self + .client + .inner() + .send(RequestMessages::Single(FrontendMessage::Raw(buf))); + } + } + + // This is done, as `Future` created by this method can be dropped after + // `RequestMessages` is synchronously send to the `Connection` by + // `batch_execute()`, but before `Responses` is asynchronously polled to + // completion. In that case `Transaction` won't be created and thus + // won't be rolled back. 
+ { + let mut cleaner = RollbackIfNotDone { + client: self, + done: false, + }; + self.batch_execute("BEGIN").await?; + cleaner.done = true; + } + + Ok(Transaction::new(self)) + } + + /// Returns a builder for a transaction with custom settings. + /// + /// Unlike the `transaction` method, the builder can be used to control the transaction's isolation level and other + /// attributes. + pub fn build_transaction(&mut self) -> TransactionBuilder<'_> { + TransactionBuilder::new(self) + } + + /// Constructs a cancellation token that can later be used to request cancellation of a query running on the + /// connection associated with this client. + pub fn cancel_token(&self) -> CancelToken { + CancelToken { + socket_config: self.socket_config.clone(), + ssl_mode: self.ssl_mode, + process_id: self.process_id, + secret_key: self.secret_key, + } + } + + /// Query for type information + pub async fn get_type(&self, oid: Oid) -> Result { + crate::prepare::get_type(&self.inner, oid).await + } + + /// Determines if the connection to the server has already closed. + /// + /// In that case, all future queries will fail. + pub fn is_closed(&self) -> bool { + self.inner.sender.is_closed() + } +} + +impl fmt::Debug for Client { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Client").finish() + } +} diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs new file mode 100644 index 0000000000..7412db785b --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -0,0 +1,109 @@ +use bytes::{Buf, Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend; +use postgres_protocol2::message::frontend::CopyData; +use std::io; +use tokio_util::codec::{Decoder, Encoder}; + +pub enum FrontendMessage { + Raw(Bytes), + CopyData(CopyData>), +} + +pub enum BackendMessage { + Normal { + messages: BackendMessages, + request_complete: bool, + }, + Async(backend::Message), +} + +pub struct BackendMessages(BytesMut); + +impl BackendMessages { + pub fn empty() -> BackendMessages { + BackendMessages(BytesMut::new()) + } +} + +impl FallibleIterator for BackendMessages { + type Item = backend::Message; + type Error = io::Error; + + fn next(&mut self) -> io::Result> { + backend::Message::parse(&mut self.0) + } +} + +pub struct PostgresCodec { + pub max_message_size: Option, +} + +impl Encoder for PostgresCodec { + type Error = io::Error; + + fn encode(&mut self, item: FrontendMessage, dst: &mut BytesMut) -> io::Result<()> { + match item { + FrontendMessage::Raw(buf) => dst.extend_from_slice(&buf), + FrontendMessage::CopyData(data) => data.write(dst), + } + + Ok(()) + } +} + +impl Decoder for PostgresCodec { + type Item = BackendMessage; + type Error = io::Error; + + fn decode(&mut self, src: &mut BytesMut) -> Result, io::Error> { + let mut idx = 0; + let mut request_complete = false; + + while let Some(header) = backend::Header::parse(&src[idx..])? 
{ + let len = header.len() as usize + 1; + if src[idx..].len() < len { + break; + } + + if let Some(max) = self.max_message_size { + if len > max { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "message too large", + )); + } + } + + match header.tag() { + backend::NOTICE_RESPONSE_TAG + | backend::NOTIFICATION_RESPONSE_TAG + | backend::PARAMETER_STATUS_TAG => { + if idx == 0 { + let message = backend::Message::parse(src)?.unwrap(); + return Ok(Some(BackendMessage::Async(message))); + } else { + break; + } + } + _ => {} + } + + idx += len; + + if header.tag() == backend::READY_FOR_QUERY_TAG { + request_complete = true; + break; + } + } + + if idx == 0 { + Ok(None) + } else { + Ok(Some(BackendMessage::Normal { + messages: BackendMessages(src.split_to(idx)), + request_complete, + })) + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs new file mode 100644 index 0000000000..969c20ba47 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -0,0 +1,897 @@ +//! Connection configuration. + +use crate::connect::connect; +use crate::connect_raw::connect_raw; +use crate::tls::MakeTlsConnect; +use crate::tls::TlsConnect; +use crate::{Client, Connection, Error}; +use std::borrow::Cow; +use std::str; +use std::str::FromStr; +use std::time::Duration; +use std::{error, fmt, iter, mem}; +use tokio::io::{AsyncRead, AsyncWrite}; + +pub use postgres_protocol2::authentication::sasl::ScramKeys; +use tokio::net::TcpStream; + +/// Properties required of a session. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum TargetSessionAttrs { + /// No special properties are required. + Any, + /// The session must allow writes. + ReadWrite, +} + +/// TLS configuration. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum SslMode { + /// Do not use TLS. + Disable, + /// Attempt to connect with TLS but allow sessions without. + Prefer, + /// Require the use of TLS. + Require, +} + +/// Channel binding configuration. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum ChannelBinding { + /// Do not use channel binding. + Disable, + /// Attempt to use channel binding but allow sessions without. + Prefer, + /// Require the use of channel binding. + Require, +} + +/// Replication mode configuration. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[non_exhaustive] +pub enum ReplicationMode { + /// Physical replication. + Physical, + /// Logical replication. + Logical, +} + +/// A host specification. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum Host { + /// A TCP hostname. + Tcp(String), +} + +/// Precomputed keys which may override password during auth. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AuthKeys { + /// A `ClientKey` & `ServerKey` pair for `SCRAM-SHA-256`. + ScramSha256(ScramKeys<32>), +} + +/// Connection configuration. +/// +/// Configuration can be parsed from libpq-style connection strings. These strings come in two formats: +/// +/// # Key-Value +/// +/// This format consists of space-separated key-value pairs. Values which are either the empty string or contain +/// whitespace should be wrapped in `'`. `'` and `\` characters should be backslash-escaped. +/// +/// ## Keys +/// +/// * `user` - The username to authenticate with. Required. +/// * `password` - The password to authenticate with. +/// * `dbname` - The name of the database to connect to. Defaults to the username. +/// * `options` - Command line options used to configure the server. 
+/// * `application_name` - Sets the `application_name` parameter on the server. +/// * `sslmode` - Controls usage of TLS. If set to `disable`, TLS will not be used. If set to `prefer`, TLS will be used +/// if available, but not used otherwise. If set to `require`, TLS will be forced to be used. Defaults to `prefer`. +/// * `host` - The host to connect to. On Unix platforms, if the host starts with a `/` character it is treated as the +/// path to the directory containing Unix domain sockets. Otherwise, it is treated as a hostname. Multiple hosts +/// can be specified, separated by commas. Each host will be tried in turn when connecting. Required if connecting +/// with the `connect` method. +/// * `port` - The port to connect to. Multiple ports can be specified, separated by commas. The number of ports must be +/// either 1, in which case it will be used for all hosts, or the same as the number of hosts. Defaults to 5432 if +/// omitted or the empty string. +/// * `connect_timeout` - The time limit in seconds applied to each socket-level connection attempt. Note that hostnames +/// can resolve to multiple IP addresses, and this limit is applied to each address. Defaults to no timeout. +/// * `target_session_attrs` - Specifies requirements of the session. If set to `read-write`, the client will check that +/// the `transaction_read_write` session parameter is set to `on`. This can be used to connect to the primary server +/// in a database cluster as opposed to the secondary read-only mirrors. Defaults to `all`. +/// * `channel_binding` - Controls usage of channel binding in the authentication process. If set to `disable`, channel +/// binding will not be used. If set to `prefer`, channel binding will be used if available, but not used otherwise. +/// If set to `require`, the authentication process will fail if channel binding is not used. Defaults to `prefer`. +/// +/// ## Examples +/// +/// ```not_rust +/// host=localhost user=postgres connect_timeout=10 keepalives=0 +/// ``` +/// +/// ```not_rust +/// host=/var/lib/postgresql,localhost port=1234 user=postgres password='password with spaces' +/// ``` +/// +/// ```not_rust +/// host=host1,host2,host3 port=1234,,5678 user=postgres target_session_attrs=read-write +/// ``` +/// +/// # Url +/// +/// This format resembles a URL with a scheme of either `postgres://` or `postgresql://`. All components are optional, +/// and the format accepts query parameters for all of the key-value pairs described in the section above. Multiple +/// host/port pairs can be comma-separated. Unix socket paths in the host section of the URL should be percent-encoded, +/// as the path component of the URL specifies the database name. 
+/// +/// ## Examples +/// +/// ```not_rust +/// postgresql://user@localhost +/// ``` +/// +/// ```not_rust +/// postgresql://user:password@%2Fvar%2Flib%2Fpostgresql/mydb?connect_timeout=10 +/// ``` +/// +/// ```not_rust +/// postgresql://user@host1:1234,host2,host3:5678?target_session_attrs=read-write +/// ``` +/// +/// ```not_rust +/// postgresql:///mydb?user=user&host=/var/lib/postgresql +/// ``` +#[derive(Clone, PartialEq, Eq)] +pub struct Config { + pub(crate) user: Option, + pub(crate) password: Option>, + pub(crate) auth_keys: Option>, + pub(crate) dbname: Option, + pub(crate) options: Option, + pub(crate) application_name: Option, + pub(crate) ssl_mode: SslMode, + pub(crate) host: Vec, + pub(crate) port: Vec, + pub(crate) connect_timeout: Option, + pub(crate) target_session_attrs: TargetSessionAttrs, + pub(crate) channel_binding: ChannelBinding, + pub(crate) replication_mode: Option, + pub(crate) max_backend_message_size: Option, +} + +impl Default for Config { + fn default() -> Config { + Config::new() + } +} + +impl Config { + /// Creates a new configuration. + pub fn new() -> Config { + Config { + user: None, + password: None, + auth_keys: None, + dbname: None, + options: None, + application_name: None, + ssl_mode: SslMode::Prefer, + host: vec![], + port: vec![], + connect_timeout: None, + target_session_attrs: TargetSessionAttrs::Any, + channel_binding: ChannelBinding::Prefer, + replication_mode: None, + max_backend_message_size: None, + } + } + + /// Sets the user to authenticate with. + /// + /// Required. + pub fn user(&mut self, user: &str) -> &mut Config { + self.user = Some(user.to_string()); + self + } + + /// Gets the user to authenticate with, if one has been configured with + /// the `user` method. + pub fn get_user(&self) -> Option<&str> { + self.user.as_deref() + } + + /// Sets the password to authenticate with. + pub fn password(&mut self, password: T) -> &mut Config + where + T: AsRef<[u8]>, + { + self.password = Some(password.as_ref().to_vec()); + self + } + + /// Gets the password to authenticate with, if one has been configured with + /// the `password` method. + pub fn get_password(&self) -> Option<&[u8]> { + self.password.as_deref() + } + + /// Sets precomputed protocol-specific keys to authenticate with. + /// When set, this option will override `password`. + /// See [`AuthKeys`] for more information. + pub fn auth_keys(&mut self, keys: AuthKeys) -> &mut Config { + self.auth_keys = Some(Box::new(keys)); + self + } + + /// Gets precomputed protocol-specific keys to authenticate with. + /// if one has been configured with the `auth_keys` method. + pub fn get_auth_keys(&self) -> Option { + self.auth_keys.as_deref().copied() + } + + /// Sets the name of the database to connect to. + /// + /// Defaults to the user. + pub fn dbname(&mut self, dbname: &str) -> &mut Config { + self.dbname = Some(dbname.to_string()); + self + } + + /// Gets the name of the database to connect to, if one has been configured + /// with the `dbname` method. + pub fn get_dbname(&self) -> Option<&str> { + self.dbname.as_deref() + } + + /// Sets command line options used to configure the server. + pub fn options(&mut self, options: &str) -> &mut Config { + self.options = Some(options.to_string()); + self + } + + /// Gets the command line options used to configure the server, if the + /// options have been set with the `options` method. + pub fn get_options(&self) -> Option<&str> { + self.options.as_deref() + } + + /// Sets the value of the `application_name` runtime parameter. 
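+ ///
+ /// For example, when building a configuration by hand (a sketch using only
+ /// the setters defined on this struct):
+ ///
+ /// ```ignore
+ /// let mut config = Config::new();
+ /// config
+ ///     .host("localhost")
+ ///     .port(5432)
+ ///     .user("postgres")
+ ///     .dbname("mydb")
+ ///     .application_name("my-application");
+ /// ```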
+ pub fn application_name(&mut self, application_name: &str) -> &mut Config { + self.application_name = Some(application_name.to_string()); + self + } + + /// Gets the value of the `application_name` runtime parameter, if it has + /// been set with the `application_name` method. + pub fn get_application_name(&self) -> Option<&str> { + self.application_name.as_deref() + } + + /// Sets the SSL configuration. + /// + /// Defaults to `prefer`. + pub fn ssl_mode(&mut self, ssl_mode: SslMode) -> &mut Config { + self.ssl_mode = ssl_mode; + self + } + + /// Gets the SSL configuration. + pub fn get_ssl_mode(&self) -> SslMode { + self.ssl_mode + } + + /// Adds a host to the configuration. + /// + /// Multiple hosts can be specified by calling this method multiple times, and each will be tried in order. + pub fn host(&mut self, host: &str) -> &mut Config { + self.host.push(Host::Tcp(host.to_string())); + self + } + + /// Gets the hosts that have been added to the configuration with `host`. + pub fn get_hosts(&self) -> &[Host] { + &self.host + } + + /// Adds a port to the configuration. + /// + /// Multiple ports can be specified by calling this method multiple times. There must either be no ports, in which + /// case the default of 5432 is used, a single port, in which it is used for all hosts, or the same number of ports + /// as hosts. + pub fn port(&mut self, port: u16) -> &mut Config { + self.port.push(port); + self + } + + /// Gets the ports that have been added to the configuration with `port`. + pub fn get_ports(&self) -> &[u16] { + &self.port + } + + /// Sets the timeout applied to socket-level connection attempts. + /// + /// Note that hostnames can resolve to multiple IP addresses, and this timeout will apply to each address of each + /// host separately. Defaults to no limit. + pub fn connect_timeout(&mut self, connect_timeout: Duration) -> &mut Config { + self.connect_timeout = Some(connect_timeout); + self + } + + /// Gets the connection timeout, if one has been set with the + /// `connect_timeout` method. + pub fn get_connect_timeout(&self) -> Option<&Duration> { + self.connect_timeout.as_ref() + } + + /// Sets the requirements of the session. + /// + /// This can be used to connect to the primary server in a clustered database rather than one of the read-only + /// secondary servers. Defaults to `Any`. + pub fn target_session_attrs( + &mut self, + target_session_attrs: TargetSessionAttrs, + ) -> &mut Config { + self.target_session_attrs = target_session_attrs; + self + } + + /// Gets the requirements of the session. + pub fn get_target_session_attrs(&self) -> TargetSessionAttrs { + self.target_session_attrs + } + + /// Sets the channel binding behavior. + /// + /// Defaults to `prefer`. + pub fn channel_binding(&mut self, channel_binding: ChannelBinding) -> &mut Config { + self.channel_binding = channel_binding; + self + } + + /// Gets the channel binding behavior. + pub fn get_channel_binding(&self) -> ChannelBinding { + self.channel_binding + } + + /// Set replication mode. + pub fn replication_mode(&mut self, replication_mode: ReplicationMode) -> &mut Config { + self.replication_mode = Some(replication_mode); + self + } + + /// Get replication mode. + pub fn get_replication_mode(&self) -> Option { + self.replication_mode + } + + /// Set limit for backend messages size. + pub fn max_backend_message_size(&mut self, max_backend_message_size: usize) -> &mut Config { + self.max_backend_message_size = Some(max_backend_message_size); + self + } + + /// Get limit for backend messages size. 
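+ ///
+ /// Returns `None` until a limit has been set, either with
+ /// `max_backend_message_size` above or via the connection-string key of the
+ /// same name. A small sketch:
+ ///
+ /// ```ignore
+ /// let config: Config = "host=localhost user=postgres max_backend_message_size=1048576"
+ ///     .parse()?;
+ /// assert_eq!(config.get_max_backend_message_size(), Some(1048576));
+ /// ```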
+ pub fn get_max_backend_message_size(&self) -> Option { + self.max_backend_message_size + } + + fn param(&mut self, key: &str, value: &str) -> Result<(), Error> { + match key { + "user" => { + self.user(value); + } + "password" => { + self.password(value); + } + "dbname" => { + self.dbname(value); + } + "options" => { + self.options(value); + } + "application_name" => { + self.application_name(value); + } + "sslmode" => { + let mode = match value { + "disable" => SslMode::Disable, + "prefer" => SslMode::Prefer, + "require" => SslMode::Require, + _ => return Err(Error::config_parse(Box::new(InvalidValue("sslmode")))), + }; + self.ssl_mode(mode); + } + "host" => { + for host in value.split(',') { + self.host(host); + } + } + "port" => { + for port in value.split(',') { + let port = if port.is_empty() { + 5432 + } else { + port.parse() + .map_err(|_| Error::config_parse(Box::new(InvalidValue("port"))))? + }; + self.port(port); + } + } + "connect_timeout" => { + let timeout = value + .parse::() + .map_err(|_| Error::config_parse(Box::new(InvalidValue("connect_timeout"))))?; + if timeout > 0 { + self.connect_timeout(Duration::from_secs(timeout as u64)); + } + } + "target_session_attrs" => { + let target_session_attrs = match value { + "any" => TargetSessionAttrs::Any, + "read-write" => TargetSessionAttrs::ReadWrite, + _ => { + return Err(Error::config_parse(Box::new(InvalidValue( + "target_session_attrs", + )))); + } + }; + self.target_session_attrs(target_session_attrs); + } + "channel_binding" => { + let channel_binding = match value { + "disable" => ChannelBinding::Disable, + "prefer" => ChannelBinding::Prefer, + "require" => ChannelBinding::Require, + _ => { + return Err(Error::config_parse(Box::new(InvalidValue( + "channel_binding", + )))) + } + }; + self.channel_binding(channel_binding); + } + "max_backend_message_size" => { + let limit = value.parse::().map_err(|_| { + Error::config_parse(Box::new(InvalidValue("max_backend_message_size"))) + })?; + if limit > 0 { + self.max_backend_message_size(limit); + } + } + key => { + return Err(Error::config_parse(Box::new(UnknownOption( + key.to_string(), + )))); + } + } + + Ok(()) + } + + /// Opens a connection to a PostgreSQL database. + /// + /// Requires the `runtime` Cargo feature (enabled by default). + pub async fn connect( + &self, + tls: T, + ) -> Result<(Client, Connection), Error> + where + T: MakeTlsConnect, + { + connect(tls, self).await + } + + /// Connects to a PostgreSQL database over an arbitrary stream. + /// + /// All of the settings other than `user`, `password`, `dbname`, `options`, and `application_name` name are ignored. + pub async fn connect_raw( + &self, + stream: S, + tls: T, + ) -> Result<(Client, Connection), Error> + where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, + { + connect_raw(stream, tls, self).await + } +} + +impl FromStr for Config { + type Err = Error; + + fn from_str(s: &str) -> Result { + match UrlParser::parse(s)? 
{ + Some(config) => Ok(config), + None => Parser::parse(s), + } + } +} + +// Omit password from debug output +impl fmt::Debug for Config { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + struct Redaction {} + impl fmt::Debug for Redaction { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "_") + } + } + + f.debug_struct("Config") + .field("user", &self.user) + .field("password", &self.password.as_ref().map(|_| Redaction {})) + .field("dbname", &self.dbname) + .field("options", &self.options) + .field("application_name", &self.application_name) + .field("ssl_mode", &self.ssl_mode) + .field("host", &self.host) + .field("port", &self.port) + .field("connect_timeout", &self.connect_timeout) + .field("target_session_attrs", &self.target_session_attrs) + .field("channel_binding", &self.channel_binding) + .field("replication", &self.replication_mode) + .finish() + } +} + +#[derive(Debug)] +struct UnknownOption(String); + +impl fmt::Display for UnknownOption { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(fmt, "unknown option `{}`", self.0) + } +} + +impl error::Error for UnknownOption {} + +#[derive(Debug)] +struct InvalidValue(&'static str); + +impl fmt::Display for InvalidValue { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(fmt, "invalid value for option `{}`", self.0) + } +} + +impl error::Error for InvalidValue {} + +struct Parser<'a> { + s: &'a str, + it: iter::Peekable>, +} + +impl<'a> Parser<'a> { + fn parse(s: &'a str) -> Result { + let mut parser = Parser { + s, + it: s.char_indices().peekable(), + }; + + let mut config = Config::new(); + + while let Some((key, value)) = parser.parameter()? { + config.param(key, &value)?; + } + + Ok(config) + } + + fn skip_ws(&mut self) { + self.take_while(char::is_whitespace); + } + + fn take_while(&mut self, f: F) -> &'a str + where + F: Fn(char) -> bool, + { + let start = match self.it.peek() { + Some(&(i, _)) => i, + None => return "", + }; + + loop { + match self.it.peek() { + Some(&(_, c)) if f(c) => { + self.it.next(); + } + Some(&(i, _)) => return &self.s[start..i], + None => return &self.s[start..], + } + } + } + + fn eat(&mut self, target: char) -> Result<(), Error> { + match self.it.next() { + Some((_, c)) if c == target => Ok(()), + Some((i, c)) => { + let m = format!( + "unexpected character at byte {}: expected `{}` but got `{}`", + i, target, c + ); + Err(Error::config_parse(m.into())) + } + None => Err(Error::config_parse("unexpected EOF".into())), + } + } + + fn eat_if(&mut self, target: char) -> bool { + match self.it.peek() { + Some(&(_, c)) if c == target => { + self.it.next(); + true + } + _ => false, + } + } + + fn keyword(&mut self) -> Option<&'a str> { + let s = self.take_while(|c| match c { + c if c.is_whitespace() => false, + '=' => false, + _ => true, + }); + + if s.is_empty() { + None + } else { + Some(s) + } + } + + fn value(&mut self) -> Result { + let value = if self.eat_if('\'') { + let value = self.quoted_value()?; + self.eat('\'')?; + value + } else { + self.simple_value()? 
+ }; + + Ok(value) + } + + fn simple_value(&mut self) -> Result { + let mut value = String::new(); + + while let Some(&(_, c)) = self.it.peek() { + if c.is_whitespace() { + break; + } + + self.it.next(); + if c == '\\' { + if let Some((_, c2)) = self.it.next() { + value.push(c2); + } + } else { + value.push(c); + } + } + + if value.is_empty() { + return Err(Error::config_parse("unexpected EOF".into())); + } + + Ok(value) + } + + fn quoted_value(&mut self) -> Result { + let mut value = String::new(); + + while let Some(&(_, c)) = self.it.peek() { + if c == '\'' { + return Ok(value); + } + + self.it.next(); + if c == '\\' { + if let Some((_, c2)) = self.it.next() { + value.push(c2); + } + } else { + value.push(c); + } + } + + Err(Error::config_parse( + "unterminated quoted connection parameter value".into(), + )) + } + + fn parameter(&mut self) -> Result, Error> { + self.skip_ws(); + let keyword = match self.keyword() { + Some(keyword) => keyword, + None => return Ok(None), + }; + self.skip_ws(); + self.eat('=')?; + self.skip_ws(); + let value = self.value()?; + + Ok(Some((keyword, value))) + } +} + +// This is a pretty sloppy "URL" parser, but it matches the behavior of libpq, where things really aren't very strict +struct UrlParser<'a> { + s: &'a str, + config: Config, +} + +impl<'a> UrlParser<'a> { + fn parse(s: &'a str) -> Result, Error> { + let s = match Self::remove_url_prefix(s) { + Some(s) => s, + None => return Ok(None), + }; + + let mut parser = UrlParser { + s, + config: Config::new(), + }; + + parser.parse_credentials()?; + parser.parse_host()?; + parser.parse_path()?; + parser.parse_params()?; + + Ok(Some(parser.config)) + } + + fn remove_url_prefix(s: &str) -> Option<&str> { + for prefix in &["postgres://", "postgresql://"] { + if let Some(stripped) = s.strip_prefix(prefix) { + return Some(stripped); + } + } + + None + } + + fn take_until(&mut self, end: &[char]) -> Option<&'a str> { + match self.s.find(end) { + Some(pos) => { + let (head, tail) = self.s.split_at(pos); + self.s = tail; + Some(head) + } + None => None, + } + } + + fn take_all(&mut self) -> &'a str { + mem::take(&mut self.s) + } + + fn eat_byte(&mut self) { + self.s = &self.s[1..]; + } + + fn parse_credentials(&mut self) -> Result<(), Error> { + let creds = match self.take_until(&['@']) { + Some(creds) => creds, + None => return Ok(()), + }; + self.eat_byte(); + + let mut it = creds.splitn(2, ':'); + let user = self.decode(it.next().unwrap())?; + self.config.user(&user); + + if let Some(password) = it.next() { + let password = Cow::from(percent_encoding::percent_decode(password.as_bytes())); + self.config.password(password); + } + + Ok(()) + } + + fn parse_host(&mut self) -> Result<(), Error> { + let host = match self.take_until(&['/', '?']) { + Some(host) => host, + None => self.take_all(), + }; + + if host.is_empty() { + return Ok(()); + } + + for chunk in host.split(',') { + let (host, port) = if chunk.starts_with('[') { + let idx = match chunk.find(']') { + Some(idx) => idx, + None => return Err(Error::config_parse(InvalidValue("host").into())), + }; + + let host = &chunk[1..idx]; + let remaining = &chunk[idx + 1..]; + let port = if let Some(port) = remaining.strip_prefix(':') { + Some(port) + } else if remaining.is_empty() { + None + } else { + return Err(Error::config_parse(InvalidValue("host").into())); + }; + + (host, port) + } else { + let mut it = chunk.splitn(2, ':'); + (it.next().unwrap(), it.next()) + }; + + self.host_param(host)?; + let port = self.decode(port.unwrap_or("5432"))?; + 
self.config.param("port", &port)?; + } + + Ok(()) + } + + fn parse_path(&mut self) -> Result<(), Error> { + if !self.s.starts_with('/') { + return Ok(()); + } + self.eat_byte(); + + let dbname = match self.take_until(&['?']) { + Some(dbname) => dbname, + None => self.take_all(), + }; + + if !dbname.is_empty() { + self.config.dbname(&self.decode(dbname)?); + } + + Ok(()) + } + + fn parse_params(&mut self) -> Result<(), Error> { + if !self.s.starts_with('?') { + return Ok(()); + } + self.eat_byte(); + + while !self.s.is_empty() { + let key = match self.take_until(&['=']) { + Some(key) => self.decode(key)?, + None => return Err(Error::config_parse("unterminated parameter".into())), + }; + self.eat_byte(); + + let value = match self.take_until(&['&']) { + Some(value) => { + self.eat_byte(); + value + } + None => self.take_all(), + }; + + if key == "host" { + self.host_param(value)?; + } else { + let value = self.decode(value)?; + self.config.param(&key, &value)?; + } + } + + Ok(()) + } + + fn host_param(&mut self, s: &str) -> Result<(), Error> { + let s = self.decode(s)?; + self.config.param("host", &s) + } + + fn decode(&self, s: &'a str) -> Result, Error> { + percent_encoding::percent_decode(s.as_bytes()) + .decode_utf8() + .map_err(|e| Error::config_parse(e.into())) + } +} diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs new file mode 100644 index 0000000000..7517fe0cde --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -0,0 +1,112 @@ +use crate::client::SocketConfig; +use crate::config::{Host, TargetSessionAttrs}; +use crate::connect_raw::connect_raw; +use crate::connect_socket::connect_socket; +use crate::tls::{MakeTlsConnect, TlsConnect}; +use crate::{Client, Config, Connection, Error, SimpleQueryMessage}; +use futures_util::{future, pin_mut, Future, FutureExt, Stream}; +use std::io; +use std::task::Poll; +use tokio::net::TcpStream; + +pub async fn connect( + mut tls: T, + config: &Config, +) -> Result<(Client, Connection), Error> +where + T: MakeTlsConnect, +{ + if config.host.is_empty() { + return Err(Error::config("host missing".into())); + } + + if config.port.len() > 1 && config.port.len() != config.host.len() { + return Err(Error::config("invalid number of ports".into())); + } + + let mut error = None; + for (i, host) in config.host.iter().enumerate() { + let port = config + .port + .get(i) + .or_else(|| config.port.first()) + .copied() + .unwrap_or(5432); + + let hostname = match host { + Host::Tcp(host) => host.as_str(), + }; + + let tls = tls + .make_tls_connect(hostname) + .map_err(|e| Error::tls(e.into()))?; + + match connect_once(host, port, tls, config).await { + Ok((client, connection)) => return Ok((client, connection)), + Err(e) => error = Some(e), + } + } + + Err(error.unwrap()) +} + +async fn connect_once( + host: &Host, + port: u16, + tls: T, + config: &Config, +) -> Result<(Client, Connection), Error> +where + T: TlsConnect, +{ + let socket = connect_socket(host, port, config.connect_timeout).await?; + let (mut client, mut connection) = connect_raw(socket, tls, config).await?; + + if let TargetSessionAttrs::ReadWrite = config.target_session_attrs { + let rows = client.simple_query_raw("SHOW transaction_read_only"); + pin_mut!(rows); + + let rows = future::poll_fn(|cx| { + if connection.poll_unpin(cx)?.is_ready() { + return Poll::Ready(Err(Error::closed())); + } + + rows.as_mut().poll(cx) + }) + .await?; + pin_mut!(rows); + + loop { + let next = future::poll_fn(|cx| { + if 
connection.poll_unpin(cx)?.is_ready() { + return Poll::Ready(Some(Err(Error::closed()))); + } + + rows.as_mut().poll_next(cx) + }); + + match next.await.transpose()? { + Some(SimpleQueryMessage::Row(row)) => { + if row.try_get(0)? == Some("on") { + return Err(Error::connect(io::Error::new( + io::ErrorKind::PermissionDenied, + "database does not allow writes", + ))); + } else { + break; + } + } + Some(_) => {} + None => return Err(Error::unexpected_message()), + } + } + } + + client.set_socket_config(SocketConfig { + host: host.clone(), + port, + connect_timeout: config.connect_timeout, + }); + + Ok((client, connection)) +} diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs new file mode 100644 index 0000000000..80677af969 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -0,0 +1,359 @@ +use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; +use crate::config::{self, AuthKeys, Config, ReplicationMode}; +use crate::connect_tls::connect_tls; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::{TlsConnect, TlsStream}; +use crate::{Client, Connection, Error}; +use bytes::BytesMut; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; +use postgres_protocol2::authentication; +use postgres_protocol2::authentication::sasl; +use postgres_protocol2::authentication::sasl::ScramSha256; +use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message}; +use postgres_protocol2::message::frontend; +use std::collections::{HashMap, VecDeque}; +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::mpsc; +use tokio_util::codec::Framed; + +pub struct StartupStream { + inner: Framed, PostgresCodec>, + buf: BackendMessages, + delayed: VecDeque, +} + +impl Sink for StartupStream +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + type Error = io::Error; + + fn poll_ready(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_ready(cx) + } + + fn start_send(mut self: Pin<&mut Self>, item: FrontendMessage) -> io::Result<()> { + Pin::new(&mut self.inner).start_send(item) + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_flush(cx) + } + + fn poll_close(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.inner).poll_close(cx) + } +} + +impl Stream for StartupStream +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + type Item = io::Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll>> { + loop { + match self.buf.next() { + Ok(Some(message)) => return Poll::Ready(Some(Ok(message))), + Ok(None) => {} + Err(e) => return Poll::Ready(Some(Err(e))), + } + + match ready!(Pin::new(&mut self.inner).poll_next(cx)) { + Some(Ok(BackendMessage::Normal { messages, .. 
})) => self.buf = messages, + Some(Ok(BackendMessage::Async(message))) => return Poll::Ready(Some(Ok(message))), + Some(Err(e)) => return Poll::Ready(Some(Err(e))), + None => return Poll::Ready(None), + } + } + } +} + +pub async fn connect_raw( + stream: S, + tls: T, + config: &Config, +) -> Result<(Client, Connection), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, +{ + let stream = connect_tls(stream, config.ssl_mode, tls).await?; + + let mut stream = StartupStream { + inner: Framed::new( + stream, + PostgresCodec { + max_message_size: config.max_backend_message_size, + }, + ), + buf: BackendMessages::empty(), + delayed: VecDeque::new(), + }; + + startup(&mut stream, config).await?; + authenticate(&mut stream, config).await?; + let (process_id, secret_key, parameters) = read_info(&mut stream).await?; + + let (sender, receiver) = mpsc::unbounded_channel(); + let client = Client::new(sender, config.ssl_mode, process_id, secret_key); + let connection = Connection::new(stream.inner, stream.delayed, parameters, receiver); + + Ok((client, connection)) +} + +async fn startup(stream: &mut StartupStream, config: &Config) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + let mut params = vec![("client_encoding", "UTF8")]; + if let Some(user) = &config.user { + params.push(("user", &**user)); + } + if let Some(dbname) = &config.dbname { + params.push(("database", &**dbname)); + } + if let Some(options) = &config.options { + params.push(("options", &**options)); + } + if let Some(application_name) = &config.application_name { + params.push(("application_name", &**application_name)); + } + if let Some(replication_mode) = &config.replication_mode { + match replication_mode { + ReplicationMode::Physical => params.push(("replication", "true")), + ReplicationMode::Logical => params.push(("replication", "database")), + } + } + + let mut buf = BytesMut::new(); + frontend::startup_message(params, &mut buf).map_err(Error::encode)?; + + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io) +} + +async fn authenticate(stream: &mut StartupStream, config: &Config) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, +{ + match stream.try_next().await.map_err(Error::io)? 
{ + Some(Message::AuthenticationOk) => { + can_skip_channel_binding(config)?; + return Ok(()); + } + Some(Message::AuthenticationCleartextPassword) => { + can_skip_channel_binding(config)?; + + let pass = config + .password + .as_ref() + .ok_or_else(|| Error::config("password missing".into()))?; + + authenticate_password(stream, pass).await?; + } + Some(Message::AuthenticationMd5Password(body)) => { + can_skip_channel_binding(config)?; + + let user = config + .user + .as_ref() + .ok_or_else(|| Error::config("user missing".into()))?; + let pass = config + .password + .as_ref() + .ok_or_else(|| Error::config("password missing".into()))?; + + let output = authentication::md5_hash(user.as_bytes(), pass, body.salt()); + authenticate_password(stream, output.as_bytes()).await?; + } + Some(Message::AuthenticationSasl(body)) => { + authenticate_sasl(stream, body, config).await?; + } + Some(Message::AuthenticationKerberosV5) + | Some(Message::AuthenticationScmCredential) + | Some(Message::AuthenticationGss) + | Some(Message::AuthenticationSspi) => { + return Err(Error::authentication( + "unsupported authentication method".into(), + )) + } + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + } + + match stream.try_next().await.map_err(Error::io)? { + Some(Message::AuthenticationOk) => Ok(()), + Some(Message::ErrorResponse(body)) => Err(Error::db(body)), + Some(_) => Err(Error::unexpected_message()), + None => Err(Error::closed()), + } +} + +fn can_skip_channel_binding(config: &Config) -> Result<(), Error> { + match config.channel_binding { + config::ChannelBinding::Disable | config::ChannelBinding::Prefer => Ok(()), + config::ChannelBinding::Require => Err(Error::authentication( + "server did not use channel binding".into(), + )), + } +} + +async fn authenticate_password( + stream: &mut StartupStream, + password: &[u8], +) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + let mut buf = BytesMut::new(); + frontend::password_message(password, &mut buf).map_err(Error::encode)?; + + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io) +} + +async fn authenticate_sasl( + stream: &mut StartupStream, + body: AuthenticationSaslBody, + config: &Config, +) -> Result<(), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, +{ + let mut has_scram = false; + let mut has_scram_plus = false; + let mut mechanisms = body.mechanisms(); + while let Some(mechanism) = mechanisms.next().map_err(Error::parse)? 
{ + match mechanism { + sasl::SCRAM_SHA_256 => has_scram = true, + sasl::SCRAM_SHA_256_PLUS => has_scram_plus = true, + _ => {} + } + } + + let channel_binding = stream + .inner + .get_ref() + .channel_binding() + .tls_server_end_point + .filter(|_| config.channel_binding != config::ChannelBinding::Disable) + .map(sasl::ChannelBinding::tls_server_end_point); + + let (channel_binding, mechanism) = if has_scram_plus { + match channel_binding { + Some(channel_binding) => (channel_binding, sasl::SCRAM_SHA_256_PLUS), + None => (sasl::ChannelBinding::unsupported(), sasl::SCRAM_SHA_256), + } + } else if has_scram { + match channel_binding { + Some(_) => (sasl::ChannelBinding::unrequested(), sasl::SCRAM_SHA_256), + None => (sasl::ChannelBinding::unsupported(), sasl::SCRAM_SHA_256), + } + } else { + return Err(Error::authentication("unsupported SASL mechanism".into())); + }; + + if mechanism != sasl::SCRAM_SHA_256_PLUS { + can_skip_channel_binding(config)?; + } + + let mut scram = if let Some(AuthKeys::ScramSha256(keys)) = config.get_auth_keys() { + ScramSha256::new_with_keys(keys, channel_binding) + } else if let Some(password) = config.get_password() { + ScramSha256::new(password, channel_binding) + } else { + return Err(Error::config("password or auth keys missing".into())); + }; + + let mut buf = BytesMut::new(); + frontend::sasl_initial_response(mechanism, scram.message(), &mut buf).map_err(Error::encode)?; + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io)?; + + let body = match stream.try_next().await.map_err(Error::io)? { + Some(Message::AuthenticationSaslContinue(body)) => body, + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + }; + + scram + .update(body.data()) + .await + .map_err(|e| Error::authentication(e.into()))?; + + let mut buf = BytesMut::new(); + frontend::sasl_response(scram.message(), &mut buf).map_err(Error::encode)?; + stream + .send(FrontendMessage::Raw(buf.freeze())) + .await + .map_err(Error::io)?; + + let body = match stream.try_next().await.map_err(Error::io)? { + Some(Message::AuthenticationSaslFinal(body)) => body, + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + }; + + scram + .finish(body.data()) + .map_err(|e| Error::authentication(e.into()))?; + + Ok(()) +} + +async fn read_info( + stream: &mut StartupStream, +) -> Result<(i32, i32, HashMap), Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: AsyncRead + AsyncWrite + Unpin, +{ + let mut process_id = 0; + let mut secret_key = 0; + let mut parameters = HashMap::new(); + + loop { + match stream.try_next().await.map_err(Error::io)? 
{ + Some(Message::BackendKeyData(body)) => { + process_id = body.process_id(); + secret_key = body.secret_key(); + } + Some(Message::ParameterStatus(body)) => { + parameters.insert( + body.name().map_err(Error::parse)?.to_string(), + body.value().map_err(Error::parse)?.to_string(), + ); + } + Some(msg @ Message::NoticeResponse(_)) => { + stream.delayed.push_back(BackendMessage::Async(msg)) + } + Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key, parameters)), + Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), + Some(_) => return Err(Error::unexpected_message()), + None => return Err(Error::closed()), + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs new file mode 100644 index 0000000000..336a13317f --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs @@ -0,0 +1,65 @@ +use crate::config::Host; +use crate::Error; +use std::future::Future; +use std::io; +use std::time::Duration; +use tokio::net::{self, TcpStream}; +use tokio::time; + +pub(crate) async fn connect_socket( + host: &Host, + port: u16, + connect_timeout: Option, +) -> Result { + match host { + Host::Tcp(host) => { + let addrs = net::lookup_host((&**host, port)) + .await + .map_err(Error::connect)?; + + let mut last_err = None; + + for addr in addrs { + let stream = + match connect_with_timeout(TcpStream::connect(addr), connect_timeout).await { + Ok(stream) => stream, + Err(e) => { + last_err = Some(e); + continue; + } + }; + + stream.set_nodelay(true).map_err(Error::connect)?; + + return Ok(stream); + } + + Err(last_err.unwrap_or_else(|| { + Error::connect(io::Error::new( + io::ErrorKind::InvalidInput, + "could not resolve any addresses", + )) + })) + } + } +} + +async fn connect_with_timeout(connect: F, timeout: Option) -> Result +where + F: Future>, +{ + match timeout { + Some(timeout) => match time::timeout(timeout, connect).await { + Ok(Ok(socket)) => Ok(socket), + Ok(Err(e)) => Err(Error::connect(e)), + Err(_) => Err(Error::connect(io::Error::new( + io::ErrorKind::TimedOut, + "connection timed out", + ))), + }, + None => match connect.await { + Ok(socket) => Ok(socket), + Err(e) => Err(Error::connect(e)), + }, + } +} diff --git a/libs/proxy/tokio-postgres2/src/connect_tls.rs b/libs/proxy/tokio-postgres2/src/connect_tls.rs new file mode 100644 index 0000000000..64b0b68abc --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/connect_tls.rs @@ -0,0 +1,48 @@ +use crate::config::SslMode; +use crate::maybe_tls_stream::MaybeTlsStream; +use crate::tls::private::ForcePrivateApi; +use crate::tls::TlsConnect; +use crate::Error; +use bytes::BytesMut; +use postgres_protocol2::message::frontend; +use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt}; + +pub async fn connect_tls( + mut stream: S, + mode: SslMode, + tls: T, +) -> Result, Error> +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsConnect, +{ + match mode { + SslMode::Disable => return Ok(MaybeTlsStream::Raw(stream)), + SslMode::Prefer if !tls.can_connect(ForcePrivateApi) => { + return Ok(MaybeTlsStream::Raw(stream)) + } + SslMode::Prefer | SslMode::Require => {} + } + + let mut buf = BytesMut::new(); + frontend::ssl_request(&mut buf); + stream.write_all(&buf).await.map_err(Error::io)?; + + let mut buf = [0]; + stream.read_exact(&mut buf).await.map_err(Error::io)?; + + if buf[0] != b'S' { + if SslMode::Require == mode { + return Err(Error::tls("server does not support TLS".into())); + } else { + return Ok(MaybeTlsStream::Raw(stream)); + } + 
+    let stream = tls
+        .connect(stream)
+        .await
+        .map_err(|e| Error::tls(e.into()))?;
+
+    Ok(MaybeTlsStream::Tls(stream))
+}
diff --git a/libs/proxy/tokio-postgres2/src/connection.rs b/libs/proxy/tokio-postgres2/src/connection.rs
new file mode 100644
index 0000000000..0aa5c77e22
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/connection.rs
@@ -0,0 +1,323 @@
+use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
+use crate::error::DbError;
+use crate::maybe_tls_stream::MaybeTlsStream;
+use crate::{AsyncMessage, Error, Notification};
+use bytes::BytesMut;
+use fallible_iterator::FallibleIterator;
+use futures_util::{ready, Sink, Stream};
+use log::{info, trace};
+use postgres_protocol2::message::backend::Message;
+use postgres_protocol2::message::frontend;
+use std::collections::{HashMap, VecDeque};
+use std::future::Future;
+use std::pin::Pin;
+use std::task::{Context, Poll};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio::sync::mpsc;
+use tokio_util::codec::Framed;
+use tokio_util::sync::PollSender;
+
+pub enum RequestMessages {
+    Single(FrontendMessage),
+}
+
+pub struct Request {
+    pub messages: RequestMessages,
+    pub sender: mpsc::Sender<BackendMessages>,
+}
+
+pub struct Response {
+    sender: PollSender<BackendMessages>,
+}
+
+#[derive(PartialEq, Debug)]
+enum State {
+    Active,
+    Terminating,
+    Closing,
+}
+
+/// A connection to a PostgreSQL database.
+///
+/// This is one half of what is returned when a new connection is established. It performs the actual IO with the
+/// server, and should generally be spawned off onto an executor to run in the background.
+///
+/// `Connection` implements `Future`, and only resolves when the connection is closed, either because a fatal error has
+/// occurred, or because its associated `Client` has dropped and all outstanding work has completed.
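+///
+/// A minimal sketch of the intended usage; the `connect` call below only stands in
+/// for however the client/connection pair is obtained in this crate and is assumed
+/// here rather than shown:
+///
+/// ```ignore
+/// let (client, connection) = config.connect(tls).await?;
+/// // Drive the connection in the background so the client can make progress.
+/// tokio::spawn(async move {
+///     if let Err(e) = connection.await {
+///         eprintln!("connection error: {}", e);
+///     }
+/// });
+/// ```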
+#[must_use = "futures do nothing unless polled"]
+pub struct Connection<S, T> {
+    /// HACK: we need this in the Neon Proxy.
+    pub stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
+    /// HACK: we need this in the Neon Proxy to forward params.
+    pub parameters: HashMap<String, String>,
+    receiver: mpsc::UnboundedReceiver<Request>,
+    pending_request: Option<RequestMessages>,
+    pending_responses: VecDeque<BackendMessage>,
+    responses: VecDeque<Response>,
+    state: State,
+}
+
+impl<S, T> Connection<S, T>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: AsyncRead + AsyncWrite + Unpin,
+{
+    pub(crate) fn new(
+        stream: Framed<MaybeTlsStream<S, T>, PostgresCodec>,
+        pending_responses: VecDeque<BackendMessage>,
+        parameters: HashMap<String, String>,
+        receiver: mpsc::UnboundedReceiver<Request>,
+    ) -> Connection<S, T> {
+        Connection {
+            stream,
+            parameters,
+            receiver,
+            pending_request: None,
+            pending_responses,
+            responses: VecDeque::new(),
+            state: State::Active,
+        }
+    }
+
+    fn poll_response(
+        &mut self,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<BackendMessage, Error>>> {
+        if let Some(message) = self.pending_responses.pop_front() {
+            trace!("retrying pending response");
+            return Poll::Ready(Some(Ok(message)));
+        }
+
+        Pin::new(&mut self.stream)
+            .poll_next(cx)
+            .map(|o| o.map(|r| r.map_err(Error::io)))
+    }
+
+    fn poll_read(&mut self, cx: &mut Context<'_>) -> Result<Option<AsyncMessage>, Error> {
+        if self.state != State::Active {
+            trace!("poll_read: done");
+            return Ok(None);
+        }
+
+        loop {
+            let message = match self.poll_response(cx)? {
+                Poll::Ready(Some(message)) => message,
+                Poll::Ready(None) => return Err(Error::closed()),
+                Poll::Pending => {
+                    trace!("poll_read: waiting on response");
+                    return Ok(None);
+                }
+            };
+
+            let (mut messages, request_complete) = match message {
+                BackendMessage::Async(Message::NoticeResponse(body)) => {
+                    let error = DbError::parse(&mut body.fields()).map_err(Error::parse)?;
+                    return Ok(Some(AsyncMessage::Notice(error)));
+                }
+                BackendMessage::Async(Message::NotificationResponse(body)) => {
+                    let notification = Notification {
+                        process_id: body.process_id(),
+                        channel: body.channel().map_err(Error::parse)?.to_string(),
+                        payload: body.message().map_err(Error::parse)?.to_string(),
+                    };
+                    return Ok(Some(AsyncMessage::Notification(notification)));
+                }
+                BackendMessage::Async(Message::ParameterStatus(body)) => {
+                    self.parameters.insert(
+                        body.name().map_err(Error::parse)?.to_string(),
+                        body.value().map_err(Error::parse)?.to_string(),
+                    );
+                    continue;
+                }
+                BackendMessage::Async(_) => unreachable!(),
+                BackendMessage::Normal {
+                    messages,
+                    request_complete,
+                } => (messages, request_complete),
+            };
+
+            let mut response = match self.responses.pop_front() {
+                Some(response) => response,
+                None => match messages.next().map_err(Error::parse)? {
+                    Some(Message::ErrorResponse(error)) => return Err(Error::db(error)),
+                    _ => return Err(Error::unexpected_message()),
+                },
+            };
+
+            match response.sender.poll_reserve(cx) {
+                Poll::Ready(Ok(())) => {
+                    let _ = response.sender.send_item(messages);
+                    if !request_complete {
+                        self.responses.push_front(response);
+                    }
+                }
+                Poll::Ready(Err(_)) => {
+                    // we need to keep paging through the rest of the messages even if the receiver's hung up
+                    if !request_complete {
+                        self.responses.push_front(response);
+                    }
+                }
+                Poll::Pending => {
+                    self.responses.push_front(response);
+                    self.pending_responses.push_back(BackendMessage::Normal {
+                        messages,
+                        request_complete,
+                    });
+                    trace!("poll_read: waiting on sender");
+                    return Ok(None);
+                }
+            }
+        }
+    }
+
+    fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll<Option<RequestMessages>> {
+        if let Some(messages) = self.pending_request.take() {
+            trace!("retrying pending request");
+            return Poll::Ready(Some(messages));
+        }
+
+        if self.receiver.is_closed() {
+            return Poll::Ready(None);
+        }
+
+        match self.receiver.poll_recv(cx) {
+            Poll::Ready(Some(request)) => {
+                trace!("polled new request");
+                self.responses.push_back(Response {
+                    sender: PollSender::new(request.sender),
+                });
+                Poll::Ready(Some(request.messages))
+            }
+            Poll::Ready(None) => Poll::Ready(None),
+            Poll::Pending => Poll::Pending,
+        }
+    }
+
+    fn poll_write(&mut self, cx: &mut Context<'_>) -> Result<bool, Error> {
+        loop {
+            if self.state == State::Closing {
+                trace!("poll_write: done");
+                return Ok(false);
+            }
+
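+            // Make sure the sink can accept another frame before dequeuing a request,
+            // so a request is never pulled off the channel that cannot be sent yet.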
+            if Pin::new(&mut self.stream)
+                .poll_ready(cx)
+                .map_err(Error::io)?
+                .is_pending()
+            {
+                trace!("poll_write: waiting on socket");
+                return Ok(false);
+            }
+
+            let request = match self.poll_request(cx) {
+                Poll::Ready(Some(request)) => request,
+                Poll::Ready(None) if self.responses.is_empty() && self.state == State::Active => {
+                    trace!("poll_write: at eof, terminating");
+                    self.state = State::Terminating;
+                    let mut request = BytesMut::new();
+                    frontend::terminate(&mut request);
+                    RequestMessages::Single(FrontendMessage::Raw(request.freeze()))
+                }
+                Poll::Ready(None) => {
+                    trace!(
+                        "poll_write: at eof, pending responses {}",
+                        self.responses.len()
+                    );
+                    return Ok(true);
+                }
+                Poll::Pending => {
+                    trace!("poll_write: waiting on request");
+                    return Ok(true);
+                }
+            };
+
+            match request {
+                RequestMessages::Single(request) => {
+                    Pin::new(&mut self.stream)
+                        .start_send(request)
+                        .map_err(Error::io)?;
+                    if self.state == State::Terminating {
+                        trace!("poll_write: sent eof, closing");
+                        self.state = State::Closing;
+                    }
+                }
+            }
+        }
+    }
+
+    fn poll_flush(&mut self, cx: &mut Context<'_>) -> Result<(), Error> {
+        match Pin::new(&mut self.stream)
+            .poll_flush(cx)
+            .map_err(Error::io)?
+        {
+            Poll::Ready(()) => trace!("poll_flush: flushed"),
+            Poll::Pending => trace!("poll_flush: waiting on socket"),
+        }
+        Ok(())
+    }
+
+    fn poll_shutdown(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
+        if self.state != State::Closing {
+            return Poll::Pending;
+        }
+
+        match Pin::new(&mut self.stream)
+            .poll_close(cx)
+            .map_err(Error::io)?
+        {
+            Poll::Ready(()) => {
+                trace!("poll_shutdown: complete");
+                Poll::Ready(Ok(()))
+            }
+            Poll::Pending => {
+                trace!("poll_shutdown: waiting on socket");
+                Poll::Pending
+            }
+        }
+    }
+
+    /// Returns the value of a runtime parameter for this connection.
+    pub fn parameter(&self, name: &str) -> Option<&str> {
+        self.parameters.get(name).map(|s| &**s)
+    }
+
+    /// Polls for asynchronous messages from the server.
+    ///
+    /// The server can send notices as well as notifications asynchronously to the client. Applications that wish to
+    /// examine those messages should use this method to drive the connection rather than its `Future` implementation.
+    pub fn poll_message(
+        &mut self,
+        cx: &mut Context<'_>,
+    ) -> Poll<Option<Result<AsyncMessage, Error>>> {
+        let message = self.poll_read(cx)?;
+        let want_flush = self.poll_write(cx)?;
+        if want_flush {
+            self.poll_flush(cx)?;
+        }
+        match message {
+            Some(message) => Poll::Ready(Some(Ok(message))),
+            None => match self.poll_shutdown(cx) {
+                Poll::Ready(Ok(())) => Poll::Ready(None),
+                Poll::Ready(Err(e)) => Poll::Ready(Some(Err(e))),
+                Poll::Pending => Poll::Pending,
+            },
+        }
+    }
+}
+
+impl<S, T> Future for Connection<S, T>
+where
+    S: AsyncRead + AsyncWrite + Unpin,
+    T: AsyncRead + AsyncWrite + Unpin,
+{
+    type Output = Result<(), Error>;
+
+    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Result<(), Error>> {
+        while let Some(message) = ready!(self.poll_message(cx)?) {
+            if let AsyncMessage::Notice(notice) = message {
+                info!("{}: {}", notice.severity(), notice.message());
+            }
+        }
+        Poll::Ready(Ok(()))
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs
new file mode 100644
index 0000000000..6514322250
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/error/mod.rs
@@ -0,0 +1,501 @@
+//! Errors.
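+//!
+//! A minimal sketch of how these errors can be inspected by callers; it assumes
+//! only the `code` accessor and the generated `SqlState` constants defined later
+//! in this module:
+//!
+//! ```ignore
+//! fn is_unique_violation(e: &Error) -> bool {
+//!     e.code() == Some(&SqlState::UNIQUE_VIOLATION)
+//! }
+//! ```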
+
+use fallible_iterator::FallibleIterator;
+use postgres_protocol2::message::backend::{ErrorFields, ErrorResponseBody};
+use std::error::{self, Error as _Error};
+use std::fmt;
+use std::io;
+
+pub use self::sqlstate::*;
+
+#[allow(clippy::unreadable_literal)]
+mod sqlstate;
+
+/// The severity of a Postgres error or notice.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum Severity {
+    /// PANIC
+    Panic,
+    /// FATAL
+    Fatal,
+    /// ERROR
+    Error,
+    /// WARNING
+    Warning,
+    /// NOTICE
+    Notice,
+    /// DEBUG
+    Debug,
+    /// INFO
+    Info,
+    /// LOG
+    Log,
+}
+
+impl fmt::Display for Severity {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        let s = match *self {
+            Severity::Panic => "PANIC",
+            Severity::Fatal => "FATAL",
+            Severity::Error => "ERROR",
+            Severity::Warning => "WARNING",
+            Severity::Notice => "NOTICE",
+            Severity::Debug => "DEBUG",
+            Severity::Info => "INFO",
+            Severity::Log => "LOG",
+        };
+        fmt.write_str(s)
+    }
+}
+
+impl Severity {
+    fn from_str(s: &str) -> Option<Severity> {
+        match s {
+            "PANIC" => Some(Severity::Panic),
+            "FATAL" => Some(Severity::Fatal),
+            "ERROR" => Some(Severity::Error),
+            "WARNING" => Some(Severity::Warning),
+            "NOTICE" => Some(Severity::Notice),
+            "DEBUG" => Some(Severity::Debug),
+            "INFO" => Some(Severity::Info),
+            "LOG" => Some(Severity::Log),
+            _ => None,
+        }
+    }
+}
+
+/// A Postgres error or notice.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DbError {
+    severity: String,
+    parsed_severity: Option<Severity>,
+    code: SqlState,
+    message: String,
+    detail: Option<String>,
+    hint: Option<String>,
+    position: Option<ErrorPosition>,
+    where_: Option<String>,
+    schema: Option<String>,
+    table: Option<String>,
+    column: Option<String>,
+    datatype: Option<String>,
+    constraint: Option<String>,
+    file: Option<String>,
+    line: Option<u32>,
+    routine: Option<String>,
+}
+
+impl DbError {
+    pub(crate) fn parse(fields: &mut ErrorFields<'_>) -> io::Result<DbError> {
+        let mut severity = None;
+        let mut parsed_severity = None;
+        let mut code = None;
+        let mut message = None;
+        let mut detail = None;
+        let mut hint = None;
+        let mut normal_position = None;
+        let mut internal_position = None;
+        let mut internal_query = None;
+        let mut where_ = None;
+        let mut schema = None;
+        let mut table = None;
+        let mut column = None;
+        let mut datatype = None;
+        let mut constraint = None;
+        let mut file = None;
+        let mut line = None;
+        let mut routine = None;
+
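+        // Each field is tagged with a one-byte type code defined by the Postgres
+        // error/notice wire format; unrecognized field types are skipped.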
+        while let Some(field) = fields.next()? {
+            match field.type_() {
+                b'S' => severity = Some(field.value().to_owned()),
+                b'C' => code = Some(SqlState::from_code(field.value())),
+                b'M' => message = Some(field.value().to_owned()),
+                b'D' => detail = Some(field.value().to_owned()),
+                b'H' => hint = Some(field.value().to_owned()),
+                b'P' => {
+                    normal_position = Some(field.value().parse::<u32>().map_err(|_| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidInput,
+                            "`P` field did not contain an integer",
+                        )
+                    })?);
+                }
+                b'p' => {
+                    internal_position = Some(field.value().parse::<u32>().map_err(|_| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidInput,
+                            "`p` field did not contain an integer",
+                        )
+                    })?);
+                }
+                b'q' => internal_query = Some(field.value().to_owned()),
+                b'W' => where_ = Some(field.value().to_owned()),
+                b's' => schema = Some(field.value().to_owned()),
+                b't' => table = Some(field.value().to_owned()),
+                b'c' => column = Some(field.value().to_owned()),
+                b'd' => datatype = Some(field.value().to_owned()),
+                b'n' => constraint = Some(field.value().to_owned()),
+                b'F' => file = Some(field.value().to_owned()),
+                b'L' => {
+                    line = Some(field.value().parse::<u32>().map_err(|_| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidInput,
+                            "`L` field did not contain an integer",
+                        )
+                    })?);
+                }
+                b'R' => routine = Some(field.value().to_owned()),
+                b'V' => {
+                    parsed_severity = Some(Severity::from_str(field.value()).ok_or_else(|| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidInput,
+                            "`V` field contained an invalid value",
+                        )
+                    })?);
+                }
+                _ => {}
+            }
+        }
+
+        Ok(DbError {
+            severity: severity
+                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`S` field missing"))?,
+            parsed_severity,
+            code: code
+                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`C` field missing"))?,
+            message: message
+                .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "`M` field missing"))?,
+            detail,
+            hint,
+            position: match normal_position {
+                Some(position) => Some(ErrorPosition::Original(position)),
+                None => match internal_position {
+                    Some(position) => Some(ErrorPosition::Internal {
+                        position,
+                        query: internal_query.ok_or_else(|| {
+                            io::Error::new(
+                                io::ErrorKind::InvalidInput,
+                                "`q` field missing but `p` field present",
+                            )
+                        })?,
+                    }),
+                    None => None,
+                },
+            },
+            where_,
+            schema,
+            table,
+            column,
+            datatype,
+            constraint,
+            file,
+            line,
+            routine,
+        })
+    }
+
+    /// The field contents are ERROR, FATAL, or PANIC (in an error message),
+    /// or WARNING, NOTICE, DEBUG, INFO, or LOG (in a notice message), or a
+    /// localized translation of one of these.
+    pub fn severity(&self) -> &str {
+        &self.severity
+    }
+
+    /// A parsed, nonlocalized version of `severity`. (PostgreSQL 9.6+)
+    pub fn parsed_severity(&self) -> Option<Severity> {
+        self.parsed_severity
+    }
+
+    /// The SQLSTATE code for the error.
+    pub fn code(&self) -> &SqlState {
+        &self.code
+    }
+
+    /// The primary human-readable error message.
+    ///
+    /// This should be accurate but terse (typically one line).
+    pub fn message(&self) -> &str {
+        &self.message
+    }
+
+    /// An optional secondary error message carrying more detail about the
+    /// problem.
+    ///
+    /// Might run to multiple lines.
+    pub fn detail(&self) -> Option<&str> {
+        self.detail.as_deref()
+    }
+
+    /// An optional suggestion what to do about the problem.
+    ///
+    /// This is intended to differ from `detail` in that it offers advice
+    /// (potentially inappropriate) rather than hard facts. Might run to
+    /// multiple lines.
+    pub fn hint(&self) -> Option<&str> {
+        self.hint.as_deref()
+    }
+
+    /// An optional error cursor position into either the original query string
+    /// or an internally generated query.
+    pub fn position(&self) -> Option<&ErrorPosition> {
+        self.position.as_ref()
+    }
+
+    /// An indication of the context in which the error occurred.
+    ///
+    /// Presently this includes a call stack traceback of active procedural
+    /// language functions and internally-generated queries. The trace is one
+    /// entry per line, most recent first.
+    pub fn where_(&self) -> Option<&str> {
+        self.where_.as_deref()
+    }
+
+    /// If the error was associated with a specific database object, the name
+    /// of the schema containing that object, if any. (PostgreSQL 9.3+)
+    pub fn schema(&self) -> Option<&str> {
+        self.schema.as_deref()
+    }
+
+    /// If the error was associated with a specific table, the name of the
+    /// table. (Refer to the schema name field for the name of the table's
+    /// schema.) (PostgreSQL 9.3+)
+    pub fn table(&self) -> Option<&str> {
+        self.table.as_deref()
+    }
+
+    /// If the error was associated with a specific table column, the name of
+    /// the column.
+    ///
+    /// (Refer to the schema and table name fields to identify the table.)
+    /// (PostgreSQL 9.3+)
+    pub fn column(&self) -> Option<&str> {
+        self.column.as_deref()
+    }
+
+    /// If the error was associated with a specific data type, the name of the
+    /// data type. (Refer to the schema name field for the name of the data
+    /// type's schema.) (PostgreSQL 9.3+)
+    pub fn datatype(&self) -> Option<&str> {
+        self.datatype.as_deref()
+    }
+
+    /// If the error was associated with a specific constraint, the name of the
+    /// constraint.
+    ///
+    /// Refer to fields listed above for the associated table or domain.
+    /// (For this purpose, indexes are treated as constraints, even if they
+    /// weren't created with constraint syntax.) (PostgreSQL 9.3+)
+    pub fn constraint(&self) -> Option<&str> {
+        self.constraint.as_deref()
+    }
+
+    /// The file name of the source-code location where the error was reported.
+    pub fn file(&self) -> Option<&str> {
+        self.file.as_deref()
+    }
+
+    /// The line number of the source-code location where the error was
+    /// reported.
+    pub fn line(&self) -> Option<u32> {
+        self.line
+    }
+
+    /// The name of the source-code routine reporting the error.
+    pub fn routine(&self) -> Option<&str> {
+        self.routine.as_deref()
+    }
+}
+
+impl fmt::Display for DbError {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(fmt, "{}: {}", self.severity, self.message)?;
+        if let Some(detail) = &self.detail {
+            write!(fmt, "\nDETAIL: {}", detail)?;
+        }
+        if let Some(hint) = &self.hint {
+            write!(fmt, "\nHINT: {}", hint)?;
+        }
+        Ok(())
+    }
+}
+
+impl error::Error for DbError {}
+
+/// Represents the position of an error in a query.
+#[derive(Clone, PartialEq, Eq, Debug)]
+pub enum ErrorPosition {
+    /// A position in the original query.
+    Original(u32),
+    /// A position in an internally generated query.
+    Internal {
+        /// The byte position.
+        position: u32,
+        /// A query generated by the Postgres server.
+        query: String,
+    },
+}
+
+#[derive(Debug, PartialEq)]
+enum Kind {
+    Io,
+    UnexpectedMessage,
+    Tls,
+    ToSql(usize),
+    FromSql(usize),
+    Column(String),
+    Closed,
+    Db,
+    Parse,
+    Encode,
+    Authentication,
+    ConfigParse,
+    Config,
+    Connect,
+    Timeout,
+}
+
+struct ErrorInner {
+    kind: Kind,
+    cause: Option<Box<dyn error::Error + Sync + Send>>,
+}
+
+/// An error communicating with the Postgres server.
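+///
+/// The kind and cause are stored behind a single `Box`, so the type itself is
+/// one pointer wide.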
+pub struct Error(Box<ErrorInner>);
+
+impl fmt::Debug for Error {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fmt.debug_struct("Error")
+            .field("kind", &self.0.kind)
+            .field("cause", &self.0.cause)
+            .finish()
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
+        match &self.0.kind {
+            Kind::Io => fmt.write_str("error communicating with the server")?,
+            Kind::UnexpectedMessage => fmt.write_str("unexpected message from server")?,
+            Kind::Tls => fmt.write_str("error performing TLS handshake")?,
+            Kind::ToSql(idx) => write!(fmt, "error serializing parameter {}", idx)?,
+            Kind::FromSql(idx) => write!(fmt, "error deserializing column {}", idx)?,
+            Kind::Column(column) => write!(fmt, "invalid column `{}`", column)?,
+            Kind::Closed => fmt.write_str("connection closed")?,
+            Kind::Db => fmt.write_str("db error")?,
+            Kind::Parse => fmt.write_str("error parsing response from server")?,
+            Kind::Encode => fmt.write_str("error encoding message to server")?,
+            Kind::Authentication => fmt.write_str("authentication error")?,
+            Kind::ConfigParse => fmt.write_str("invalid connection string")?,
+            Kind::Config => fmt.write_str("invalid configuration")?,
+            Kind::Connect => fmt.write_str("error connecting to server")?,
+            Kind::Timeout => fmt.write_str("timeout waiting for server")?,
+        };
+        if let Some(ref cause) = self.0.cause {
+            write!(fmt, ": {}", cause)?;
+        }
+        Ok(())
+    }
+}
+
+impl error::Error for Error {
+    fn source(&self) -> Option<&(dyn error::Error + 'static)> {
+        self.0.cause.as_ref().map(|e| &**e as _)
+    }
+}
+
+impl Error {
+    /// Consumes the error, returning its cause.
+    pub fn into_source(self) -> Option<Box<dyn error::Error + Sync + Send>> {
+        self.0.cause
+    }
+
+    /// Returns the source of this error if it was a `DbError`.
+    ///
+    /// This is a simple convenience method.
+    pub fn as_db_error(&self) -> Option<&DbError> {
+        self.source().and_then(|e| e.downcast_ref::<DbError>())
+    }
+
+    /// Determines if the error was associated with a closed connection.
+    pub fn is_closed(&self) -> bool {
+        self.0.kind == Kind::Closed
+    }
+
+    /// Returns the SQLSTATE error code associated with the error.
+    ///
+    /// This is a convenience method that downcasts the cause to a `DbError` and returns its code.
+    pub fn code(&self) -> Option<&SqlState> {
+        self.as_db_error().map(DbError::code)
+    }
+
+    fn new(kind: Kind, cause: Option<Box<dyn error::Error + Sync + Send>>) -> Error {
+        Error(Box::new(ErrorInner { kind, cause }))
+    }
+
+    pub(crate) fn closed() -> Error {
+        Error::new(Kind::Closed, None)
+    }
+
+    pub(crate) fn unexpected_message() -> Error {
+        Error::new(Kind::UnexpectedMessage, None)
+    }
+
+    #[allow(clippy::needless_pass_by_value)]
+    pub(crate) fn db(error: ErrorResponseBody) -> Error {
+        match DbError::parse(&mut error.fields()) {
+            Ok(e) => Error::new(Kind::Db, Some(Box::new(e))),
+            Err(e) => Error::new(Kind::Parse, Some(Box::new(e))),
+        }
+    }
+
+    pub(crate) fn parse(e: io::Error) -> Error {
+        Error::new(Kind::Parse, Some(Box::new(e)))
+    }
+
+    pub(crate) fn encode(e: io::Error) -> Error {
+        Error::new(Kind::Encode, Some(Box::new(e)))
+    }
+
+    #[allow(clippy::wrong_self_convention)]
+    pub(crate) fn to_sql(e: Box<dyn error::Error + Sync + Send>, idx: usize) -> Error {
+        Error::new(Kind::ToSql(idx), Some(e))
+    }
+
+    pub(crate) fn from_sql(e: Box<dyn error::Error + Sync + Send>, idx: usize) -> Error {
+        Error::new(Kind::FromSql(idx), Some(e))
+    }
+
+    pub(crate) fn column(column: String) -> Error {
+        Error::new(Kind::Column(column), None)
+    }
+
+    pub(crate) fn tls(e: Box<dyn error::Error + Sync + Send>) -> Error {
+        Error::new(Kind::Tls, Some(e))
+    }
+
+    pub(crate) fn io(e: io::Error) -> Error {
+        Error::new(Kind::Io, Some(Box::new(e)))
+    }
+
+    pub(crate) fn authentication(e: Box<dyn error::Error + Sync + Send>) -> Error {
+        Error::new(Kind::Authentication, Some(e))
+    }
+
+    pub(crate) fn config_parse(e: Box<dyn error::Error + Sync + Send>) -> Error {
+        Error::new(Kind::ConfigParse, Some(e))
+    }
+
+    pub(crate) fn config(e: Box<dyn error::Error + Sync + Send>) -> Error {
+        Error::new(Kind::Config, Some(e))
+    }
+
+    pub(crate) fn connect(e: io::Error) -> Error {
+        Error::new(Kind::Connect, Some(Box::new(e)))
+    }
+
+    #[doc(hidden)]
+    pub fn __private_api_timeout() -> Error {
+        Error::new(Kind::Timeout, None)
+    }
+}
diff --git a/libs/proxy/tokio-postgres2/src/error/sqlstate.rs b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs
new file mode 100644
index 0000000000..13a1d75f95
--- /dev/null
+++ b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs
@@ -0,0 +1,1670 @@
+// Autogenerated file - DO NOT EDIT
+
+/// A SQLSTATE error code
+#[derive(PartialEq, Eq, Clone, Debug)]
+pub struct SqlState(Inner);
+
+impl SqlState {
+    /// Creates a `SqlState` from its error code.
+    pub fn from_code(s: &str) -> SqlState {
+        match SQLSTATE_MAP.get(s) {
+            Some(state) => state.clone(),
+            None => SqlState(Inner::Other(s.into())),
+        }
+    }
+
+    /// Returns the error code corresponding to the `SqlState`.
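+    /// Codes that are not in the generated table are returned exactly as they
+    /// were received, via the `Other` variant.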
+ pub fn code(&self) -> &str { + match &self.0 { + Inner::E00000 => "00000", + Inner::E01000 => "01000", + Inner::E0100C => "0100C", + Inner::E01008 => "01008", + Inner::E01003 => "01003", + Inner::E01007 => "01007", + Inner::E01006 => "01006", + Inner::E01004 => "01004", + Inner::E01P01 => "01P01", + Inner::E02000 => "02000", + Inner::E02001 => "02001", + Inner::E03000 => "03000", + Inner::E08000 => "08000", + Inner::E08003 => "08003", + Inner::E08006 => "08006", + Inner::E08001 => "08001", + Inner::E08004 => "08004", + Inner::E08007 => "08007", + Inner::E08P01 => "08P01", + Inner::E09000 => "09000", + Inner::E0A000 => "0A000", + Inner::E0B000 => "0B000", + Inner::E0F000 => "0F000", + Inner::E0F001 => "0F001", + Inner::E0L000 => "0L000", + Inner::E0LP01 => "0LP01", + Inner::E0P000 => "0P000", + Inner::E0Z000 => "0Z000", + Inner::E0Z002 => "0Z002", + Inner::E20000 => "20000", + Inner::E21000 => "21000", + Inner::E22000 => "22000", + Inner::E2202E => "2202E", + Inner::E22021 => "22021", + Inner::E22008 => "22008", + Inner::E22012 => "22012", + Inner::E22005 => "22005", + Inner::E2200B => "2200B", + Inner::E22022 => "22022", + Inner::E22015 => "22015", + Inner::E2201E => "2201E", + Inner::E22014 => "22014", + Inner::E22016 => "22016", + Inner::E2201F => "2201F", + Inner::E2201G => "2201G", + Inner::E22018 => "22018", + Inner::E22007 => "22007", + Inner::E22019 => "22019", + Inner::E2200D => "2200D", + Inner::E22025 => "22025", + Inner::E22P06 => "22P06", + Inner::E22010 => "22010", + Inner::E22023 => "22023", + Inner::E22013 => "22013", + Inner::E2201B => "2201B", + Inner::E2201W => "2201W", + Inner::E2201X => "2201X", + Inner::E2202H => "2202H", + Inner::E2202G => "2202G", + Inner::E22009 => "22009", + Inner::E2200C => "2200C", + Inner::E2200G => "2200G", + Inner::E22004 => "22004", + Inner::E22002 => "22002", + Inner::E22003 => "22003", + Inner::E2200H => "2200H", + Inner::E22026 => "22026", + Inner::E22001 => "22001", + Inner::E22011 => "22011", + Inner::E22027 => "22027", + Inner::E22024 => "22024", + Inner::E2200F => "2200F", + Inner::E22P01 => "22P01", + Inner::E22P02 => "22P02", + Inner::E22P03 => "22P03", + Inner::E22P04 => "22P04", + Inner::E22P05 => "22P05", + Inner::E2200L => "2200L", + Inner::E2200M => "2200M", + Inner::E2200N => "2200N", + Inner::E2200S => "2200S", + Inner::E2200T => "2200T", + Inner::E22030 => "22030", + Inner::E22031 => "22031", + Inner::E22032 => "22032", + Inner::E22033 => "22033", + Inner::E22034 => "22034", + Inner::E22035 => "22035", + Inner::E22036 => "22036", + Inner::E22037 => "22037", + Inner::E22038 => "22038", + Inner::E22039 => "22039", + Inner::E2203A => "2203A", + Inner::E2203B => "2203B", + Inner::E2203C => "2203C", + Inner::E2203D => "2203D", + Inner::E2203E => "2203E", + Inner::E2203F => "2203F", + Inner::E2203G => "2203G", + Inner::E23000 => "23000", + Inner::E23001 => "23001", + Inner::E23502 => "23502", + Inner::E23503 => "23503", + Inner::E23505 => "23505", + Inner::E23514 => "23514", + Inner::E23P01 => "23P01", + Inner::E24000 => "24000", + Inner::E25000 => "25000", + Inner::E25001 => "25001", + Inner::E25002 => "25002", + Inner::E25008 => "25008", + Inner::E25003 => "25003", + Inner::E25004 => "25004", + Inner::E25005 => "25005", + Inner::E25006 => "25006", + Inner::E25007 => "25007", + Inner::E25P01 => "25P01", + Inner::E25P02 => "25P02", + Inner::E25P03 => "25P03", + Inner::E26000 => "26000", + Inner::E27000 => "27000", + Inner::E28000 => "28000", + Inner::E28P01 => "28P01", + Inner::E2B000 => "2B000", + Inner::E2BP01 => "2BP01", + 
Inner::E2D000 => "2D000", + Inner::E2F000 => "2F000", + Inner::E2F005 => "2F005", + Inner::E2F002 => "2F002", + Inner::E2F003 => "2F003", + Inner::E2F004 => "2F004", + Inner::E34000 => "34000", + Inner::E38000 => "38000", + Inner::E38001 => "38001", + Inner::E38002 => "38002", + Inner::E38003 => "38003", + Inner::E38004 => "38004", + Inner::E39000 => "39000", + Inner::E39001 => "39001", + Inner::E39004 => "39004", + Inner::E39P01 => "39P01", + Inner::E39P02 => "39P02", + Inner::E39P03 => "39P03", + Inner::E3B000 => "3B000", + Inner::E3B001 => "3B001", + Inner::E3D000 => "3D000", + Inner::E3F000 => "3F000", + Inner::E40000 => "40000", + Inner::E40002 => "40002", + Inner::E40001 => "40001", + Inner::E40003 => "40003", + Inner::E40P01 => "40P01", + Inner::E42000 => "42000", + Inner::E42601 => "42601", + Inner::E42501 => "42501", + Inner::E42846 => "42846", + Inner::E42803 => "42803", + Inner::E42P20 => "42P20", + Inner::E42P19 => "42P19", + Inner::E42830 => "42830", + Inner::E42602 => "42602", + Inner::E42622 => "42622", + Inner::E42939 => "42939", + Inner::E42804 => "42804", + Inner::E42P18 => "42P18", + Inner::E42P21 => "42P21", + Inner::E42P22 => "42P22", + Inner::E42809 => "42809", + Inner::E428C9 => "428C9", + Inner::E42703 => "42703", + Inner::E42883 => "42883", + Inner::E42P01 => "42P01", + Inner::E42P02 => "42P02", + Inner::E42704 => "42704", + Inner::E42701 => "42701", + Inner::E42P03 => "42P03", + Inner::E42P04 => "42P04", + Inner::E42723 => "42723", + Inner::E42P05 => "42P05", + Inner::E42P06 => "42P06", + Inner::E42P07 => "42P07", + Inner::E42712 => "42712", + Inner::E42710 => "42710", + Inner::E42702 => "42702", + Inner::E42725 => "42725", + Inner::E42P08 => "42P08", + Inner::E42P09 => "42P09", + Inner::E42P10 => "42P10", + Inner::E42611 => "42611", + Inner::E42P11 => "42P11", + Inner::E42P12 => "42P12", + Inner::E42P13 => "42P13", + Inner::E42P14 => "42P14", + Inner::E42P15 => "42P15", + Inner::E42P16 => "42P16", + Inner::E42P17 => "42P17", + Inner::E44000 => "44000", + Inner::E53000 => "53000", + Inner::E53100 => "53100", + Inner::E53200 => "53200", + Inner::E53300 => "53300", + Inner::E53400 => "53400", + Inner::E54000 => "54000", + Inner::E54001 => "54001", + Inner::E54011 => "54011", + Inner::E54023 => "54023", + Inner::E55000 => "55000", + Inner::E55006 => "55006", + Inner::E55P02 => "55P02", + Inner::E55P03 => "55P03", + Inner::E55P04 => "55P04", + Inner::E57000 => "57000", + Inner::E57014 => "57014", + Inner::E57P01 => "57P01", + Inner::E57P02 => "57P02", + Inner::E57P03 => "57P03", + Inner::E57P04 => "57P04", + Inner::E57P05 => "57P05", + Inner::E58000 => "58000", + Inner::E58030 => "58030", + Inner::E58P01 => "58P01", + Inner::E58P02 => "58P02", + Inner::E72000 => "72000", + Inner::EF0000 => "F0000", + Inner::EF0001 => "F0001", + Inner::EHV000 => "HV000", + Inner::EHV005 => "HV005", + Inner::EHV002 => "HV002", + Inner::EHV010 => "HV010", + Inner::EHV021 => "HV021", + Inner::EHV024 => "HV024", + Inner::EHV007 => "HV007", + Inner::EHV008 => "HV008", + Inner::EHV004 => "HV004", + Inner::EHV006 => "HV006", + Inner::EHV091 => "HV091", + Inner::EHV00B => "HV00B", + Inner::EHV00C => "HV00C", + Inner::EHV00D => "HV00D", + Inner::EHV090 => "HV090", + Inner::EHV00A => "HV00A", + Inner::EHV009 => "HV009", + Inner::EHV014 => "HV014", + Inner::EHV001 => "HV001", + Inner::EHV00P => "HV00P", + Inner::EHV00J => "HV00J", + Inner::EHV00K => "HV00K", + Inner::EHV00Q => "HV00Q", + Inner::EHV00R => "HV00R", + Inner::EHV00L => "HV00L", + Inner::EHV00M => "HV00M", + Inner::EHV00N => "HV00N", 
+ Inner::EP0000 => "P0000", + Inner::EP0001 => "P0001", + Inner::EP0002 => "P0002", + Inner::EP0003 => "P0003", + Inner::EP0004 => "P0004", + Inner::EXX000 => "XX000", + Inner::EXX001 => "XX001", + Inner::EXX002 => "XX002", + Inner::Other(code) => code, + } + } + + /// 00000 + pub const SUCCESSFUL_COMPLETION: SqlState = SqlState(Inner::E00000); + + /// 01000 + pub const WARNING: SqlState = SqlState(Inner::E01000); + + /// 0100C + pub const WARNING_DYNAMIC_RESULT_SETS_RETURNED: SqlState = SqlState(Inner::E0100C); + + /// 01008 + pub const WARNING_IMPLICIT_ZERO_BIT_PADDING: SqlState = SqlState(Inner::E01008); + + /// 01003 + pub const WARNING_NULL_VALUE_ELIMINATED_IN_SET_FUNCTION: SqlState = SqlState(Inner::E01003); + + /// 01007 + pub const WARNING_PRIVILEGE_NOT_GRANTED: SqlState = SqlState(Inner::E01007); + + /// 01006 + pub const WARNING_PRIVILEGE_NOT_REVOKED: SqlState = SqlState(Inner::E01006); + + /// 01004 + pub const WARNING_STRING_DATA_RIGHT_TRUNCATION: SqlState = SqlState(Inner::E01004); + + /// 01P01 + pub const WARNING_DEPRECATED_FEATURE: SqlState = SqlState(Inner::E01P01); + + /// 02000 + pub const NO_DATA: SqlState = SqlState(Inner::E02000); + + /// 02001 + pub const NO_ADDITIONAL_DYNAMIC_RESULT_SETS_RETURNED: SqlState = SqlState(Inner::E02001); + + /// 03000 + pub const SQL_STATEMENT_NOT_YET_COMPLETE: SqlState = SqlState(Inner::E03000); + + /// 08000 + pub const CONNECTION_EXCEPTION: SqlState = SqlState(Inner::E08000); + + /// 08003 + pub const CONNECTION_DOES_NOT_EXIST: SqlState = SqlState(Inner::E08003); + + /// 08006 + pub const CONNECTION_FAILURE: SqlState = SqlState(Inner::E08006); + + /// 08001 + pub const SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION: SqlState = SqlState(Inner::E08001); + + /// 08004 + pub const SQLSERVER_REJECTED_ESTABLISHMENT_OF_SQLCONNECTION: SqlState = SqlState(Inner::E08004); + + /// 08007 + pub const TRANSACTION_RESOLUTION_UNKNOWN: SqlState = SqlState(Inner::E08007); + + /// 08P01 + pub const PROTOCOL_VIOLATION: SqlState = SqlState(Inner::E08P01); + + /// 09000 + pub const TRIGGERED_ACTION_EXCEPTION: SqlState = SqlState(Inner::E09000); + + /// 0A000 + pub const FEATURE_NOT_SUPPORTED: SqlState = SqlState(Inner::E0A000); + + /// 0B000 + pub const INVALID_TRANSACTION_INITIATION: SqlState = SqlState(Inner::E0B000); + + /// 0F000 + pub const LOCATOR_EXCEPTION: SqlState = SqlState(Inner::E0F000); + + /// 0F001 + pub const L_E_INVALID_SPECIFICATION: SqlState = SqlState(Inner::E0F001); + + /// 0L000 + pub const INVALID_GRANTOR: SqlState = SqlState(Inner::E0L000); + + /// 0LP01 + pub const INVALID_GRANT_OPERATION: SqlState = SqlState(Inner::E0LP01); + + /// 0P000 + pub const INVALID_ROLE_SPECIFICATION: SqlState = SqlState(Inner::E0P000); + + /// 0Z000 + pub const DIAGNOSTICS_EXCEPTION: SqlState = SqlState(Inner::E0Z000); + + /// 0Z002 + pub const STACKED_DIAGNOSTICS_ACCESSED_WITHOUT_ACTIVE_HANDLER: SqlState = + SqlState(Inner::E0Z002); + + /// 20000 + pub const CASE_NOT_FOUND: SqlState = SqlState(Inner::E20000); + + /// 21000 + pub const CARDINALITY_VIOLATION: SqlState = SqlState(Inner::E21000); + + /// 22000 + pub const DATA_EXCEPTION: SqlState = SqlState(Inner::E22000); + + /// 2202E + pub const ARRAY_ELEMENT_ERROR: SqlState = SqlState(Inner::E2202E); + + /// 2202E + pub const ARRAY_SUBSCRIPT_ERROR: SqlState = SqlState(Inner::E2202E); + + /// 22021 + pub const CHARACTER_NOT_IN_REPERTOIRE: SqlState = SqlState(Inner::E22021); + + /// 22008 + pub const DATETIME_FIELD_OVERFLOW: SqlState = SqlState(Inner::E22008); + + /// 22008 + pub const 
DATETIME_VALUE_OUT_OF_RANGE: SqlState = SqlState(Inner::E22008); + + /// 22012 + pub const DIVISION_BY_ZERO: SqlState = SqlState(Inner::E22012); + + /// 22005 + pub const ERROR_IN_ASSIGNMENT: SqlState = SqlState(Inner::E22005); + + /// 2200B + pub const ESCAPE_CHARACTER_CONFLICT: SqlState = SqlState(Inner::E2200B); + + /// 22022 + pub const INDICATOR_OVERFLOW: SqlState = SqlState(Inner::E22022); + + /// 22015 + pub const INTERVAL_FIELD_OVERFLOW: SqlState = SqlState(Inner::E22015); + + /// 2201E + pub const INVALID_ARGUMENT_FOR_LOG: SqlState = SqlState(Inner::E2201E); + + /// 22014 + pub const INVALID_ARGUMENT_FOR_NTILE: SqlState = SqlState(Inner::E22014); + + /// 22016 + pub const INVALID_ARGUMENT_FOR_NTH_VALUE: SqlState = SqlState(Inner::E22016); + + /// 2201F + pub const INVALID_ARGUMENT_FOR_POWER_FUNCTION: SqlState = SqlState(Inner::E2201F); + + /// 2201G + pub const INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION: SqlState = SqlState(Inner::E2201G); + + /// 22018 + pub const INVALID_CHARACTER_VALUE_FOR_CAST: SqlState = SqlState(Inner::E22018); + + /// 22007 + pub const INVALID_DATETIME_FORMAT: SqlState = SqlState(Inner::E22007); + + /// 22019 + pub const INVALID_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E22019); + + /// 2200D + pub const INVALID_ESCAPE_OCTET: SqlState = SqlState(Inner::E2200D); + + /// 22025 + pub const INVALID_ESCAPE_SEQUENCE: SqlState = SqlState(Inner::E22025); + + /// 22P06 + pub const NONSTANDARD_USE_OF_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E22P06); + + /// 22010 + pub const INVALID_INDICATOR_PARAMETER_VALUE: SqlState = SqlState(Inner::E22010); + + /// 22023 + pub const INVALID_PARAMETER_VALUE: SqlState = SqlState(Inner::E22023); + + /// 22013 + pub const INVALID_PRECEDING_OR_FOLLOWING_SIZE: SqlState = SqlState(Inner::E22013); + + /// 2201B + pub const INVALID_REGULAR_EXPRESSION: SqlState = SqlState(Inner::E2201B); + + /// 2201W + pub const INVALID_ROW_COUNT_IN_LIMIT_CLAUSE: SqlState = SqlState(Inner::E2201W); + + /// 2201X + pub const INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE: SqlState = SqlState(Inner::E2201X); + + /// 2202H + pub const INVALID_TABLESAMPLE_ARGUMENT: SqlState = SqlState(Inner::E2202H); + + /// 2202G + pub const INVALID_TABLESAMPLE_REPEAT: SqlState = SqlState(Inner::E2202G); + + /// 22009 + pub const INVALID_TIME_ZONE_DISPLACEMENT_VALUE: SqlState = SqlState(Inner::E22009); + + /// 2200C + pub const INVALID_USE_OF_ESCAPE_CHARACTER: SqlState = SqlState(Inner::E2200C); + + /// 2200G + pub const MOST_SPECIFIC_TYPE_MISMATCH: SqlState = SqlState(Inner::E2200G); + + /// 22004 + pub const NULL_VALUE_NOT_ALLOWED: SqlState = SqlState(Inner::E22004); + + /// 22002 + pub const NULL_VALUE_NO_INDICATOR_PARAMETER: SqlState = SqlState(Inner::E22002); + + /// 22003 + pub const NUMERIC_VALUE_OUT_OF_RANGE: SqlState = SqlState(Inner::E22003); + + /// 2200H + pub const SEQUENCE_GENERATOR_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E2200H); + + /// 22026 + pub const STRING_DATA_LENGTH_MISMATCH: SqlState = SqlState(Inner::E22026); + + /// 22001 + pub const STRING_DATA_RIGHT_TRUNCATION: SqlState = SqlState(Inner::E22001); + + /// 22011 + pub const SUBSTRING_ERROR: SqlState = SqlState(Inner::E22011); + + /// 22027 + pub const TRIM_ERROR: SqlState = SqlState(Inner::E22027); + + /// 22024 + pub const UNTERMINATED_C_STRING: SqlState = SqlState(Inner::E22024); + + /// 2200F + pub const ZERO_LENGTH_CHARACTER_STRING: SqlState = SqlState(Inner::E2200F); + + /// 22P01 + pub const FLOATING_POINT_EXCEPTION: SqlState = SqlState(Inner::E22P01); + + /// 22P02 + pub const 
INVALID_TEXT_REPRESENTATION: SqlState = SqlState(Inner::E22P02); + + /// 22P03 + pub const INVALID_BINARY_REPRESENTATION: SqlState = SqlState(Inner::E22P03); + + /// 22P04 + pub const BAD_COPY_FILE_FORMAT: SqlState = SqlState(Inner::E22P04); + + /// 22P05 + pub const UNTRANSLATABLE_CHARACTER: SqlState = SqlState(Inner::E22P05); + + /// 2200L + pub const NOT_AN_XML_DOCUMENT: SqlState = SqlState(Inner::E2200L); + + /// 2200M + pub const INVALID_XML_DOCUMENT: SqlState = SqlState(Inner::E2200M); + + /// 2200N + pub const INVALID_XML_CONTENT: SqlState = SqlState(Inner::E2200N); + + /// 2200S + pub const INVALID_XML_COMMENT: SqlState = SqlState(Inner::E2200S); + + /// 2200T + pub const INVALID_XML_PROCESSING_INSTRUCTION: SqlState = SqlState(Inner::E2200T); + + /// 22030 + pub const DUPLICATE_JSON_OBJECT_KEY_VALUE: SqlState = SqlState(Inner::E22030); + + /// 22031 + pub const INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION: SqlState = SqlState(Inner::E22031); + + /// 22032 + pub const INVALID_JSON_TEXT: SqlState = SqlState(Inner::E22032); + + /// 22033 + pub const INVALID_SQL_JSON_SUBSCRIPT: SqlState = SqlState(Inner::E22033); + + /// 22034 + pub const MORE_THAN_ONE_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22034); + + /// 22035 + pub const NO_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22035); + + /// 22036 + pub const NON_NUMERIC_SQL_JSON_ITEM: SqlState = SqlState(Inner::E22036); + + /// 22037 + pub const NON_UNIQUE_KEYS_IN_A_JSON_OBJECT: SqlState = SqlState(Inner::E22037); + + /// 22038 + pub const SINGLETON_SQL_JSON_ITEM_REQUIRED: SqlState = SqlState(Inner::E22038); + + /// 22039 + pub const SQL_JSON_ARRAY_NOT_FOUND: SqlState = SqlState(Inner::E22039); + + /// 2203A + pub const SQL_JSON_MEMBER_NOT_FOUND: SqlState = SqlState(Inner::E2203A); + + /// 2203B + pub const SQL_JSON_NUMBER_NOT_FOUND: SqlState = SqlState(Inner::E2203B); + + /// 2203C + pub const SQL_JSON_OBJECT_NOT_FOUND: SqlState = SqlState(Inner::E2203C); + + /// 2203D + pub const TOO_MANY_JSON_ARRAY_ELEMENTS: SqlState = SqlState(Inner::E2203D); + + /// 2203E + pub const TOO_MANY_JSON_OBJECT_MEMBERS: SqlState = SqlState(Inner::E2203E); + + /// 2203F + pub const SQL_JSON_SCALAR_REQUIRED: SqlState = SqlState(Inner::E2203F); + + /// 2203G + pub const SQL_JSON_ITEM_CANNOT_BE_CAST_TO_TARGET_TYPE: SqlState = SqlState(Inner::E2203G); + + /// 23000 + pub const INTEGRITY_CONSTRAINT_VIOLATION: SqlState = SqlState(Inner::E23000); + + /// 23001 + pub const RESTRICT_VIOLATION: SqlState = SqlState(Inner::E23001); + + /// 23502 + pub const NOT_NULL_VIOLATION: SqlState = SqlState(Inner::E23502); + + /// 23503 + pub const FOREIGN_KEY_VIOLATION: SqlState = SqlState(Inner::E23503); + + /// 23505 + pub const UNIQUE_VIOLATION: SqlState = SqlState(Inner::E23505); + + /// 23514 + pub const CHECK_VIOLATION: SqlState = SqlState(Inner::E23514); + + /// 23P01 + pub const EXCLUSION_VIOLATION: SqlState = SqlState(Inner::E23P01); + + /// 24000 + pub const INVALID_CURSOR_STATE: SqlState = SqlState(Inner::E24000); + + /// 25000 + pub const INVALID_TRANSACTION_STATE: SqlState = SqlState(Inner::E25000); + + /// 25001 + pub const ACTIVE_SQL_TRANSACTION: SqlState = SqlState(Inner::E25001); + + /// 25002 + pub const BRANCH_TRANSACTION_ALREADY_ACTIVE: SqlState = SqlState(Inner::E25002); + + /// 25008 + pub const HELD_CURSOR_REQUIRES_SAME_ISOLATION_LEVEL: SqlState = SqlState(Inner::E25008); + + /// 25003 + pub const INAPPROPRIATE_ACCESS_MODE_FOR_BRANCH_TRANSACTION: SqlState = SqlState(Inner::E25003); + + /// 25004 + pub const 
INAPPROPRIATE_ISOLATION_LEVEL_FOR_BRANCH_TRANSACTION: SqlState = + SqlState(Inner::E25004); + + /// 25005 + pub const NO_ACTIVE_SQL_TRANSACTION_FOR_BRANCH_TRANSACTION: SqlState = SqlState(Inner::E25005); + + /// 25006 + pub const READ_ONLY_SQL_TRANSACTION: SqlState = SqlState(Inner::E25006); + + /// 25007 + pub const SCHEMA_AND_DATA_STATEMENT_MIXING_NOT_SUPPORTED: SqlState = SqlState(Inner::E25007); + + /// 25P01 + pub const NO_ACTIVE_SQL_TRANSACTION: SqlState = SqlState(Inner::E25P01); + + /// 25P02 + pub const IN_FAILED_SQL_TRANSACTION: SqlState = SqlState(Inner::E25P02); + + /// 25P03 + pub const IDLE_IN_TRANSACTION_SESSION_TIMEOUT: SqlState = SqlState(Inner::E25P03); + + /// 26000 + pub const INVALID_SQL_STATEMENT_NAME: SqlState = SqlState(Inner::E26000); + + /// 26000 + pub const UNDEFINED_PSTATEMENT: SqlState = SqlState(Inner::E26000); + + /// 27000 + pub const TRIGGERED_DATA_CHANGE_VIOLATION: SqlState = SqlState(Inner::E27000); + + /// 28000 + pub const INVALID_AUTHORIZATION_SPECIFICATION: SqlState = SqlState(Inner::E28000); + + /// 28P01 + pub const INVALID_PASSWORD: SqlState = SqlState(Inner::E28P01); + + /// 2B000 + pub const DEPENDENT_PRIVILEGE_DESCRIPTORS_STILL_EXIST: SqlState = SqlState(Inner::E2B000); + + /// 2BP01 + pub const DEPENDENT_OBJECTS_STILL_EXIST: SqlState = SqlState(Inner::E2BP01); + + /// 2D000 + pub const INVALID_TRANSACTION_TERMINATION: SqlState = SqlState(Inner::E2D000); + + /// 2F000 + pub const SQL_ROUTINE_EXCEPTION: SqlState = SqlState(Inner::E2F000); + + /// 2F005 + pub const S_R_E_FUNCTION_EXECUTED_NO_RETURN_STATEMENT: SqlState = SqlState(Inner::E2F005); + + /// 2F002 + pub const S_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E2F002); + + /// 2F003 + pub const S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED: SqlState = SqlState(Inner::E2F003); + + /// 2F004 + pub const S_R_E_READING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E2F004); + + /// 34000 + pub const INVALID_CURSOR_NAME: SqlState = SqlState(Inner::E34000); + + /// 34000 + pub const UNDEFINED_CURSOR: SqlState = SqlState(Inner::E34000); + + /// 38000 + pub const EXTERNAL_ROUTINE_EXCEPTION: SqlState = SqlState(Inner::E38000); + + /// 38001 + pub const E_R_E_CONTAINING_SQL_NOT_PERMITTED: SqlState = SqlState(Inner::E38001); + + /// 38002 + pub const E_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E38002); + + /// 38003 + pub const E_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED: SqlState = SqlState(Inner::E38003); + + /// 38004 + pub const E_R_E_READING_SQL_DATA_NOT_PERMITTED: SqlState = SqlState(Inner::E38004); + + /// 39000 + pub const EXTERNAL_ROUTINE_INVOCATION_EXCEPTION: SqlState = SqlState(Inner::E39000); + + /// 39001 + pub const E_R_I_E_INVALID_SQLSTATE_RETURNED: SqlState = SqlState(Inner::E39001); + + /// 39004 + pub const E_R_I_E_NULL_VALUE_NOT_ALLOWED: SqlState = SqlState(Inner::E39004); + + /// 39P01 + pub const E_R_I_E_TRIGGER_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P01); + + /// 39P02 + pub const E_R_I_E_SRF_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P02); + + /// 39P03 + pub const E_R_I_E_EVENT_TRIGGER_PROTOCOL_VIOLATED: SqlState = SqlState(Inner::E39P03); + + /// 3B000 + pub const SAVEPOINT_EXCEPTION: SqlState = SqlState(Inner::E3B000); + + /// 3B001 + pub const S_E_INVALID_SPECIFICATION: SqlState = SqlState(Inner::E3B001); + + /// 3D000 + pub const INVALID_CATALOG_NAME: SqlState = SqlState(Inner::E3D000); + + /// 3D000 + pub const UNDEFINED_DATABASE: SqlState = SqlState(Inner::E3D000); + + /// 3F000 + pub const INVALID_SCHEMA_NAME: 
SqlState = SqlState(Inner::E3F000); + + /// 3F000 + pub const UNDEFINED_SCHEMA: SqlState = SqlState(Inner::E3F000); + + /// 40000 + pub const TRANSACTION_ROLLBACK: SqlState = SqlState(Inner::E40000); + + /// 40002 + pub const T_R_INTEGRITY_CONSTRAINT_VIOLATION: SqlState = SqlState(Inner::E40002); + + /// 40001 + pub const T_R_SERIALIZATION_FAILURE: SqlState = SqlState(Inner::E40001); + + /// 40003 + pub const T_R_STATEMENT_COMPLETION_UNKNOWN: SqlState = SqlState(Inner::E40003); + + /// 40P01 + pub const T_R_DEADLOCK_DETECTED: SqlState = SqlState(Inner::E40P01); + + /// 42000 + pub const SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION: SqlState = SqlState(Inner::E42000); + + /// 42601 + pub const SYNTAX_ERROR: SqlState = SqlState(Inner::E42601); + + /// 42501 + pub const INSUFFICIENT_PRIVILEGE: SqlState = SqlState(Inner::E42501); + + /// 42846 + pub const CANNOT_COERCE: SqlState = SqlState(Inner::E42846); + + /// 42803 + pub const GROUPING_ERROR: SqlState = SqlState(Inner::E42803); + + /// 42P20 + pub const WINDOWING_ERROR: SqlState = SqlState(Inner::E42P20); + + /// 42P19 + pub const INVALID_RECURSION: SqlState = SqlState(Inner::E42P19); + + /// 42830 + pub const INVALID_FOREIGN_KEY: SqlState = SqlState(Inner::E42830); + + /// 42602 + pub const INVALID_NAME: SqlState = SqlState(Inner::E42602); + + /// 42622 + pub const NAME_TOO_LONG: SqlState = SqlState(Inner::E42622); + + /// 42939 + pub const RESERVED_NAME: SqlState = SqlState(Inner::E42939); + + /// 42804 + pub const DATATYPE_MISMATCH: SqlState = SqlState(Inner::E42804); + + /// 42P18 + pub const INDETERMINATE_DATATYPE: SqlState = SqlState(Inner::E42P18); + + /// 42P21 + pub const COLLATION_MISMATCH: SqlState = SqlState(Inner::E42P21); + + /// 42P22 + pub const INDETERMINATE_COLLATION: SqlState = SqlState(Inner::E42P22); + + /// 42809 + pub const WRONG_OBJECT_TYPE: SqlState = SqlState(Inner::E42809); + + /// 428C9 + pub const GENERATED_ALWAYS: SqlState = SqlState(Inner::E428C9); + + /// 42703 + pub const UNDEFINED_COLUMN: SqlState = SqlState(Inner::E42703); + + /// 42883 + pub const UNDEFINED_FUNCTION: SqlState = SqlState(Inner::E42883); + + /// 42P01 + pub const UNDEFINED_TABLE: SqlState = SqlState(Inner::E42P01); + + /// 42P02 + pub const UNDEFINED_PARAMETER: SqlState = SqlState(Inner::E42P02); + + /// 42704 + pub const UNDEFINED_OBJECT: SqlState = SqlState(Inner::E42704); + + /// 42701 + pub const DUPLICATE_COLUMN: SqlState = SqlState(Inner::E42701); + + /// 42P03 + pub const DUPLICATE_CURSOR: SqlState = SqlState(Inner::E42P03); + + /// 42P04 + pub const DUPLICATE_DATABASE: SqlState = SqlState(Inner::E42P04); + + /// 42723 + pub const DUPLICATE_FUNCTION: SqlState = SqlState(Inner::E42723); + + /// 42P05 + pub const DUPLICATE_PSTATEMENT: SqlState = SqlState(Inner::E42P05); + + /// 42P06 + pub const DUPLICATE_SCHEMA: SqlState = SqlState(Inner::E42P06); + + /// 42P07 + pub const DUPLICATE_TABLE: SqlState = SqlState(Inner::E42P07); + + /// 42712 + pub const DUPLICATE_ALIAS: SqlState = SqlState(Inner::E42712); + + /// 42710 + pub const DUPLICATE_OBJECT: SqlState = SqlState(Inner::E42710); + + /// 42702 + pub const AMBIGUOUS_COLUMN: SqlState = SqlState(Inner::E42702); + + /// 42725 + pub const AMBIGUOUS_FUNCTION: SqlState = SqlState(Inner::E42725); + + /// 42P08 + pub const AMBIGUOUS_PARAMETER: SqlState = SqlState(Inner::E42P08); + + /// 42P09 + pub const AMBIGUOUS_ALIAS: SqlState = SqlState(Inner::E42P09); + + /// 42P10 + pub const INVALID_COLUMN_REFERENCE: SqlState = SqlState(Inner::E42P10); + + /// 42611 + pub const INVALID_COLUMN_DEFINITION: 
SqlState = SqlState(Inner::E42611); + + /// 42P11 + pub const INVALID_CURSOR_DEFINITION: SqlState = SqlState(Inner::E42P11); + + /// 42P12 + pub const INVALID_DATABASE_DEFINITION: SqlState = SqlState(Inner::E42P12); + + /// 42P13 + pub const INVALID_FUNCTION_DEFINITION: SqlState = SqlState(Inner::E42P13); + + /// 42P14 + pub const INVALID_PSTATEMENT_DEFINITION: SqlState = SqlState(Inner::E42P14); + + /// 42P15 + pub const INVALID_SCHEMA_DEFINITION: SqlState = SqlState(Inner::E42P15); + + /// 42P16 + pub const INVALID_TABLE_DEFINITION: SqlState = SqlState(Inner::E42P16); + + /// 42P17 + pub const INVALID_OBJECT_DEFINITION: SqlState = SqlState(Inner::E42P17); + + /// 44000 + pub const WITH_CHECK_OPTION_VIOLATION: SqlState = SqlState(Inner::E44000); + + /// 53000 + pub const INSUFFICIENT_RESOURCES: SqlState = SqlState(Inner::E53000); + + /// 53100 + pub const DISK_FULL: SqlState = SqlState(Inner::E53100); + + /// 53200 + pub const OUT_OF_MEMORY: SqlState = SqlState(Inner::E53200); + + /// 53300 + pub const TOO_MANY_CONNECTIONS: SqlState = SqlState(Inner::E53300); + + /// 53400 + pub const CONFIGURATION_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E53400); + + /// 54000 + pub const PROGRAM_LIMIT_EXCEEDED: SqlState = SqlState(Inner::E54000); + + /// 54001 + pub const STATEMENT_TOO_COMPLEX: SqlState = SqlState(Inner::E54001); + + /// 54011 + pub const TOO_MANY_COLUMNS: SqlState = SqlState(Inner::E54011); + + /// 54023 + pub const TOO_MANY_ARGUMENTS: SqlState = SqlState(Inner::E54023); + + /// 55000 + pub const OBJECT_NOT_IN_PREREQUISITE_STATE: SqlState = SqlState(Inner::E55000); + + /// 55006 + pub const OBJECT_IN_USE: SqlState = SqlState(Inner::E55006); + + /// 55P02 + pub const CANT_CHANGE_RUNTIME_PARAM: SqlState = SqlState(Inner::E55P02); + + /// 55P03 + pub const LOCK_NOT_AVAILABLE: SqlState = SqlState(Inner::E55P03); + + /// 55P04 + pub const UNSAFE_NEW_ENUM_VALUE_USAGE: SqlState = SqlState(Inner::E55P04); + + /// 57000 + pub const OPERATOR_INTERVENTION: SqlState = SqlState(Inner::E57000); + + /// 57014 + pub const QUERY_CANCELED: SqlState = SqlState(Inner::E57014); + + /// 57P01 + pub const ADMIN_SHUTDOWN: SqlState = SqlState(Inner::E57P01); + + /// 57P02 + pub const CRASH_SHUTDOWN: SqlState = SqlState(Inner::E57P02); + + /// 57P03 + pub const CANNOT_CONNECT_NOW: SqlState = SqlState(Inner::E57P03); + + /// 57P04 + pub const DATABASE_DROPPED: SqlState = SqlState(Inner::E57P04); + + /// 57P05 + pub const IDLE_SESSION_TIMEOUT: SqlState = SqlState(Inner::E57P05); + + /// 58000 + pub const SYSTEM_ERROR: SqlState = SqlState(Inner::E58000); + + /// 58030 + pub const IO_ERROR: SqlState = SqlState(Inner::E58030); + + /// 58P01 + pub const UNDEFINED_FILE: SqlState = SqlState(Inner::E58P01); + + /// 58P02 + pub const DUPLICATE_FILE: SqlState = SqlState(Inner::E58P02); + + /// 72000 + pub const SNAPSHOT_TOO_OLD: SqlState = SqlState(Inner::E72000); + + /// F0000 + pub const CONFIG_FILE_ERROR: SqlState = SqlState(Inner::EF0000); + + /// F0001 + pub const LOCK_FILE_EXISTS: SqlState = SqlState(Inner::EF0001); + + /// HV000 + pub const FDW_ERROR: SqlState = SqlState(Inner::EHV000); + + /// HV005 + pub const FDW_COLUMN_NAME_NOT_FOUND: SqlState = SqlState(Inner::EHV005); + + /// HV002 + pub const FDW_DYNAMIC_PARAMETER_VALUE_NEEDED: SqlState = SqlState(Inner::EHV002); + + /// HV010 + pub const FDW_FUNCTION_SEQUENCE_ERROR: SqlState = SqlState(Inner::EHV010); + + /// HV021 + pub const FDW_INCONSISTENT_DESCRIPTOR_INFORMATION: SqlState = SqlState(Inner::EHV021); + + /// HV024 + pub const 
FDW_INVALID_ATTRIBUTE_VALUE: SqlState = SqlState(Inner::EHV024); + + /// HV007 + pub const FDW_INVALID_COLUMN_NAME: SqlState = SqlState(Inner::EHV007); + + /// HV008 + pub const FDW_INVALID_COLUMN_NUMBER: SqlState = SqlState(Inner::EHV008); + + /// HV004 + pub const FDW_INVALID_DATA_TYPE: SqlState = SqlState(Inner::EHV004); + + /// HV006 + pub const FDW_INVALID_DATA_TYPE_DESCRIPTORS: SqlState = SqlState(Inner::EHV006); + + /// HV091 + pub const FDW_INVALID_DESCRIPTOR_FIELD_IDENTIFIER: SqlState = SqlState(Inner::EHV091); + + /// HV00B + pub const FDW_INVALID_HANDLE: SqlState = SqlState(Inner::EHV00B); + + /// HV00C + pub const FDW_INVALID_OPTION_INDEX: SqlState = SqlState(Inner::EHV00C); + + /// HV00D + pub const FDW_INVALID_OPTION_NAME: SqlState = SqlState(Inner::EHV00D); + + /// HV090 + pub const FDW_INVALID_STRING_LENGTH_OR_BUFFER_LENGTH: SqlState = SqlState(Inner::EHV090); + + /// HV00A + pub const FDW_INVALID_STRING_FORMAT: SqlState = SqlState(Inner::EHV00A); + + /// HV009 + pub const FDW_INVALID_USE_OF_NULL_POINTER: SqlState = SqlState(Inner::EHV009); + + /// HV014 + pub const FDW_TOO_MANY_HANDLES: SqlState = SqlState(Inner::EHV014); + + /// HV001 + pub const FDW_OUT_OF_MEMORY: SqlState = SqlState(Inner::EHV001); + + /// HV00P + pub const FDW_NO_SCHEMAS: SqlState = SqlState(Inner::EHV00P); + + /// HV00J + pub const FDW_OPTION_NAME_NOT_FOUND: SqlState = SqlState(Inner::EHV00J); + + /// HV00K + pub const FDW_REPLY_HANDLE: SqlState = SqlState(Inner::EHV00K); + + /// HV00Q + pub const FDW_SCHEMA_NOT_FOUND: SqlState = SqlState(Inner::EHV00Q); + + /// HV00R + pub const FDW_TABLE_NOT_FOUND: SqlState = SqlState(Inner::EHV00R); + + /// HV00L + pub const FDW_UNABLE_TO_CREATE_EXECUTION: SqlState = SqlState(Inner::EHV00L); + + /// HV00M + pub const FDW_UNABLE_TO_CREATE_REPLY: SqlState = SqlState(Inner::EHV00M); + + /// HV00N + pub const FDW_UNABLE_TO_ESTABLISH_CONNECTION: SqlState = SqlState(Inner::EHV00N); + + /// P0000 + pub const PLPGSQL_ERROR: SqlState = SqlState(Inner::EP0000); + + /// P0001 + pub const RAISE_EXCEPTION: SqlState = SqlState(Inner::EP0001); + + /// P0002 + pub const NO_DATA_FOUND: SqlState = SqlState(Inner::EP0002); + + /// P0003 + pub const TOO_MANY_ROWS: SqlState = SqlState(Inner::EP0003); + + /// P0004 + pub const ASSERT_FAILURE: SqlState = SqlState(Inner::EP0004); + + /// XX000 + pub const INTERNAL_ERROR: SqlState = SqlState(Inner::EXX000); + + /// XX001 + pub const DATA_CORRUPTED: SqlState = SqlState(Inner::EXX001); + + /// XX002 + pub const INDEX_CORRUPTED: SqlState = SqlState(Inner::EXX002); +} + +#[derive(PartialEq, Eq, Clone, Debug)] +#[allow(clippy::upper_case_acronyms)] +enum Inner { + E00000, + E01000, + E0100C, + E01008, + E01003, + E01007, + E01006, + E01004, + E01P01, + E02000, + E02001, + E03000, + E08000, + E08003, + E08006, + E08001, + E08004, + E08007, + E08P01, + E09000, + E0A000, + E0B000, + E0F000, + E0F001, + E0L000, + E0LP01, + E0P000, + E0Z000, + E0Z002, + E20000, + E21000, + E22000, + E2202E, + E22021, + E22008, + E22012, + E22005, + E2200B, + E22022, + E22015, + E2201E, + E22014, + E22016, + E2201F, + E2201G, + E22018, + E22007, + E22019, + E2200D, + E22025, + E22P06, + E22010, + E22023, + E22013, + E2201B, + E2201W, + E2201X, + E2202H, + E2202G, + E22009, + E2200C, + E2200G, + E22004, + E22002, + E22003, + E2200H, + E22026, + E22001, + E22011, + E22027, + E22024, + E2200F, + E22P01, + E22P02, + E22P03, + E22P04, + E22P05, + E2200L, + E2200M, + E2200N, + E2200S, + E2200T, + E22030, + E22031, + E22032, + E22033, + E22034, + E22035, + E22036, + E22037, 
+ E22038, + E22039, + E2203A, + E2203B, + E2203C, + E2203D, + E2203E, + E2203F, + E2203G, + E23000, + E23001, + E23502, + E23503, + E23505, + E23514, + E23P01, + E24000, + E25000, + E25001, + E25002, + E25008, + E25003, + E25004, + E25005, + E25006, + E25007, + E25P01, + E25P02, + E25P03, + E26000, + E27000, + E28000, + E28P01, + E2B000, + E2BP01, + E2D000, + E2F000, + E2F005, + E2F002, + E2F003, + E2F004, + E34000, + E38000, + E38001, + E38002, + E38003, + E38004, + E39000, + E39001, + E39004, + E39P01, + E39P02, + E39P03, + E3B000, + E3B001, + E3D000, + E3F000, + E40000, + E40002, + E40001, + E40003, + E40P01, + E42000, + E42601, + E42501, + E42846, + E42803, + E42P20, + E42P19, + E42830, + E42602, + E42622, + E42939, + E42804, + E42P18, + E42P21, + E42P22, + E42809, + E428C9, + E42703, + E42883, + E42P01, + E42P02, + E42704, + E42701, + E42P03, + E42P04, + E42723, + E42P05, + E42P06, + E42P07, + E42712, + E42710, + E42702, + E42725, + E42P08, + E42P09, + E42P10, + E42611, + E42P11, + E42P12, + E42P13, + E42P14, + E42P15, + E42P16, + E42P17, + E44000, + E53000, + E53100, + E53200, + E53300, + E53400, + E54000, + E54001, + E54011, + E54023, + E55000, + E55006, + E55P02, + E55P03, + E55P04, + E57000, + E57014, + E57P01, + E57P02, + E57P03, + E57P04, + E57P05, + E58000, + E58030, + E58P01, + E58P02, + E72000, + EF0000, + EF0001, + EHV000, + EHV005, + EHV002, + EHV010, + EHV021, + EHV024, + EHV007, + EHV008, + EHV004, + EHV006, + EHV091, + EHV00B, + EHV00C, + EHV00D, + EHV090, + EHV00A, + EHV009, + EHV014, + EHV001, + EHV00P, + EHV00J, + EHV00K, + EHV00Q, + EHV00R, + EHV00L, + EHV00M, + EHV00N, + EP0000, + EP0001, + EP0002, + EP0003, + EP0004, + EXX000, + EXX001, + EXX002, + Other(Box), +} + +#[rustfmt::skip] +static SQLSTATE_MAP: phf::Map<&'static str, SqlState> = +::phf::Map { + key: 12913932095322966823, + disps: &[ + (0, 24), + (0, 12), + (0, 74), + (0, 109), + (0, 11), + (0, 9), + (0, 0), + (4, 38), + (3, 155), + (0, 6), + (1, 242), + (0, 66), + (0, 53), + (5, 180), + (3, 221), + (7, 230), + (0, 125), + (1, 46), + (0, 11), + (1, 2), + (0, 5), + (0, 13), + (0, 171), + (0, 15), + (0, 4), + (0, 22), + (1, 85), + (0, 75), + (2, 0), + (1, 25), + (7, 47), + (0, 45), + (0, 35), + (0, 7), + (7, 124), + (0, 0), + (14, 104), + (1, 183), + (61, 50), + (3, 76), + (0, 12), + (0, 7), + (4, 189), + (0, 1), + (64, 102), + (0, 0), + (16, 192), + (24, 19), + (0, 5), + (0, 87), + (0, 89), + (0, 14), + ], + entries: &[ + ("2F000", SqlState::SQL_ROUTINE_EXCEPTION), + ("01008", SqlState::WARNING_IMPLICIT_ZERO_BIT_PADDING), + ("42501", SqlState::INSUFFICIENT_PRIVILEGE), + ("22000", SqlState::DATA_EXCEPTION), + ("0100C", SqlState::WARNING_DYNAMIC_RESULT_SETS_RETURNED), + ("2200N", SqlState::INVALID_XML_CONTENT), + ("40001", SqlState::T_R_SERIALIZATION_FAILURE), + ("28P01", SqlState::INVALID_PASSWORD), + ("38000", SqlState::EXTERNAL_ROUTINE_EXCEPTION), + ("25006", SqlState::READ_ONLY_SQL_TRANSACTION), + ("2203D", SqlState::TOO_MANY_JSON_ARRAY_ELEMENTS), + ("42P09", SqlState::AMBIGUOUS_ALIAS), + ("F0000", SqlState::CONFIG_FILE_ERROR), + ("42P18", SqlState::INDETERMINATE_DATATYPE), + ("40002", SqlState::T_R_INTEGRITY_CONSTRAINT_VIOLATION), + ("22009", SqlState::INVALID_TIME_ZONE_DISPLACEMENT_VALUE), + ("42P08", SqlState::AMBIGUOUS_PARAMETER), + ("08000", SqlState::CONNECTION_EXCEPTION), + ("25P01", SqlState::NO_ACTIVE_SQL_TRANSACTION), + ("22024", SqlState::UNTERMINATED_C_STRING), + ("55000", SqlState::OBJECT_NOT_IN_PREREQUISITE_STATE), + ("25001", SqlState::ACTIVE_SQL_TRANSACTION), + ("03000", 
SqlState::SQL_STATEMENT_NOT_YET_COMPLETE), + ("42710", SqlState::DUPLICATE_OBJECT), + ("2D000", SqlState::INVALID_TRANSACTION_TERMINATION), + ("2200G", SqlState::MOST_SPECIFIC_TYPE_MISMATCH), + ("22022", SqlState::INDICATOR_OVERFLOW), + ("55006", SqlState::OBJECT_IN_USE), + ("53200", SqlState::OUT_OF_MEMORY), + ("22012", SqlState::DIVISION_BY_ZERO), + ("P0002", SqlState::NO_DATA_FOUND), + ("XX001", SqlState::DATA_CORRUPTED), + ("22P05", SqlState::UNTRANSLATABLE_CHARACTER), + ("40003", SqlState::T_R_STATEMENT_COMPLETION_UNKNOWN), + ("22021", SqlState::CHARACTER_NOT_IN_REPERTOIRE), + ("25000", SqlState::INVALID_TRANSACTION_STATE), + ("42P15", SqlState::INVALID_SCHEMA_DEFINITION), + ("0B000", SqlState::INVALID_TRANSACTION_INITIATION), + ("22004", SqlState::NULL_VALUE_NOT_ALLOWED), + ("42804", SqlState::DATATYPE_MISMATCH), + ("42803", SqlState::GROUPING_ERROR), + ("02001", SqlState::NO_ADDITIONAL_DYNAMIC_RESULT_SETS_RETURNED), + ("25002", SqlState::BRANCH_TRANSACTION_ALREADY_ACTIVE), + ("28000", SqlState::INVALID_AUTHORIZATION_SPECIFICATION), + ("HV009", SqlState::FDW_INVALID_USE_OF_NULL_POINTER), + ("22P01", SqlState::FLOATING_POINT_EXCEPTION), + ("2B000", SqlState::DEPENDENT_PRIVILEGE_DESCRIPTORS_STILL_EXIST), + ("42723", SqlState::DUPLICATE_FUNCTION), + ("21000", SqlState::CARDINALITY_VIOLATION), + ("0Z002", SqlState::STACKED_DIAGNOSTICS_ACCESSED_WITHOUT_ACTIVE_HANDLER), + ("23505", SqlState::UNIQUE_VIOLATION), + ("HV00J", SqlState::FDW_OPTION_NAME_NOT_FOUND), + ("23P01", SqlState::EXCLUSION_VIOLATION), + ("39P03", SqlState::E_R_I_E_EVENT_TRIGGER_PROTOCOL_VIOLATED), + ("42P10", SqlState::INVALID_COLUMN_REFERENCE), + ("2202H", SqlState::INVALID_TABLESAMPLE_ARGUMENT), + ("55P04", SqlState::UNSAFE_NEW_ENUM_VALUE_USAGE), + ("P0000", SqlState::PLPGSQL_ERROR), + ("2F005", SqlState::S_R_E_FUNCTION_EXECUTED_NO_RETURN_STATEMENT), + ("HV00M", SqlState::FDW_UNABLE_TO_CREATE_REPLY), + ("0A000", SqlState::FEATURE_NOT_SUPPORTED), + ("24000", SqlState::INVALID_CURSOR_STATE), + ("25008", SqlState::HELD_CURSOR_REQUIRES_SAME_ISOLATION_LEVEL), + ("01003", SqlState::WARNING_NULL_VALUE_ELIMINATED_IN_SET_FUNCTION), + ("42712", SqlState::DUPLICATE_ALIAS), + ("HV014", SqlState::FDW_TOO_MANY_HANDLES), + ("58030", SqlState::IO_ERROR), + ("2201W", SqlState::INVALID_ROW_COUNT_IN_LIMIT_CLAUSE), + ("22033", SqlState::INVALID_SQL_JSON_SUBSCRIPT), + ("2BP01", SqlState::DEPENDENT_OBJECTS_STILL_EXIST), + ("HV005", SqlState::FDW_COLUMN_NAME_NOT_FOUND), + ("25004", SqlState::INAPPROPRIATE_ISOLATION_LEVEL_FOR_BRANCH_TRANSACTION), + ("54000", SqlState::PROGRAM_LIMIT_EXCEEDED), + ("20000", SqlState::CASE_NOT_FOUND), + ("2203G", SqlState::SQL_JSON_ITEM_CANNOT_BE_CAST_TO_TARGET_TYPE), + ("22038", SqlState::SINGLETON_SQL_JSON_ITEM_REQUIRED), + ("22007", SqlState::INVALID_DATETIME_FORMAT), + ("08004", SqlState::SQLSERVER_REJECTED_ESTABLISHMENT_OF_SQLCONNECTION), + ("2200H", SqlState::SEQUENCE_GENERATOR_LIMIT_EXCEEDED), + ("HV00D", SqlState::FDW_INVALID_OPTION_NAME), + ("P0004", SqlState::ASSERT_FAILURE), + ("22018", SqlState::INVALID_CHARACTER_VALUE_FOR_CAST), + ("0L000", SqlState::INVALID_GRANTOR), + ("22P04", SqlState::BAD_COPY_FILE_FORMAT), + ("22031", SqlState::INVALID_ARGUMENT_FOR_SQL_JSON_DATETIME_FUNCTION), + ("01P01", SqlState::WARNING_DEPRECATED_FEATURE), + ("0LP01", SqlState::INVALID_GRANT_OPERATION), + ("58P02", SqlState::DUPLICATE_FILE), + ("26000", SqlState::INVALID_SQL_STATEMENT_NAME), + ("54001", SqlState::STATEMENT_TOO_COMPLEX), + ("22010", SqlState::INVALID_INDICATOR_PARAMETER_VALUE), + ("HV00C", 
SqlState::FDW_INVALID_OPTION_INDEX), + ("22008", SqlState::DATETIME_FIELD_OVERFLOW), + ("42P06", SqlState::DUPLICATE_SCHEMA), + ("25007", SqlState::SCHEMA_AND_DATA_STATEMENT_MIXING_NOT_SUPPORTED), + ("42P20", SqlState::WINDOWING_ERROR), + ("HV091", SqlState::FDW_INVALID_DESCRIPTOR_FIELD_IDENTIFIER), + ("HV021", SqlState::FDW_INCONSISTENT_DESCRIPTOR_INFORMATION), + ("42702", SqlState::AMBIGUOUS_COLUMN), + ("02000", SqlState::NO_DATA), + ("54011", SqlState::TOO_MANY_COLUMNS), + ("HV004", SqlState::FDW_INVALID_DATA_TYPE), + ("01006", SqlState::WARNING_PRIVILEGE_NOT_REVOKED), + ("42701", SqlState::DUPLICATE_COLUMN), + ("08P01", SqlState::PROTOCOL_VIOLATION), + ("42622", SqlState::NAME_TOO_LONG), + ("P0003", SqlState::TOO_MANY_ROWS), + ("22003", SqlState::NUMERIC_VALUE_OUT_OF_RANGE), + ("42P03", SqlState::DUPLICATE_CURSOR), + ("23001", SqlState::RESTRICT_VIOLATION), + ("57000", SqlState::OPERATOR_INTERVENTION), + ("22027", SqlState::TRIM_ERROR), + ("42P12", SqlState::INVALID_DATABASE_DEFINITION), + ("3B000", SqlState::SAVEPOINT_EXCEPTION), + ("2201B", SqlState::INVALID_REGULAR_EXPRESSION), + ("22030", SqlState::DUPLICATE_JSON_OBJECT_KEY_VALUE), + ("2F004", SqlState::S_R_E_READING_SQL_DATA_NOT_PERMITTED), + ("428C9", SqlState::GENERATED_ALWAYS), + ("2200S", SqlState::INVALID_XML_COMMENT), + ("22039", SqlState::SQL_JSON_ARRAY_NOT_FOUND), + ("42809", SqlState::WRONG_OBJECT_TYPE), + ("2201X", SqlState::INVALID_ROW_COUNT_IN_RESULT_OFFSET_CLAUSE), + ("39001", SqlState::E_R_I_E_INVALID_SQLSTATE_RETURNED), + ("25P02", SqlState::IN_FAILED_SQL_TRANSACTION), + ("0P000", SqlState::INVALID_ROLE_SPECIFICATION), + ("HV00N", SqlState::FDW_UNABLE_TO_ESTABLISH_CONNECTION), + ("53100", SqlState::DISK_FULL), + ("42601", SqlState::SYNTAX_ERROR), + ("23000", SqlState::INTEGRITY_CONSTRAINT_VIOLATION), + ("HV006", SqlState::FDW_INVALID_DATA_TYPE_DESCRIPTORS), + ("HV00B", SqlState::FDW_INVALID_HANDLE), + ("HV00Q", SqlState::FDW_SCHEMA_NOT_FOUND), + ("01000", SqlState::WARNING), + ("42883", SqlState::UNDEFINED_FUNCTION), + ("57P01", SqlState::ADMIN_SHUTDOWN), + ("22037", SqlState::NON_UNIQUE_KEYS_IN_A_JSON_OBJECT), + ("00000", SqlState::SUCCESSFUL_COMPLETION), + ("55P03", SqlState::LOCK_NOT_AVAILABLE), + ("42P01", SqlState::UNDEFINED_TABLE), + ("42830", SqlState::INVALID_FOREIGN_KEY), + ("22005", SqlState::ERROR_IN_ASSIGNMENT), + ("22025", SqlState::INVALID_ESCAPE_SEQUENCE), + ("XX002", SqlState::INDEX_CORRUPTED), + ("42P16", SqlState::INVALID_TABLE_DEFINITION), + ("55P02", SqlState::CANT_CHANGE_RUNTIME_PARAM), + ("22019", SqlState::INVALID_ESCAPE_CHARACTER), + ("P0001", SqlState::RAISE_EXCEPTION), + ("72000", SqlState::SNAPSHOT_TOO_OLD), + ("42P11", SqlState::INVALID_CURSOR_DEFINITION), + ("40P01", SqlState::T_R_DEADLOCK_DETECTED), + ("57P02", SqlState::CRASH_SHUTDOWN), + ("HV00A", SqlState::FDW_INVALID_STRING_FORMAT), + ("2F002", SqlState::S_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED), + ("23503", SqlState::FOREIGN_KEY_VIOLATION), + ("40000", SqlState::TRANSACTION_ROLLBACK), + ("22032", SqlState::INVALID_JSON_TEXT), + ("2202E", SqlState::ARRAY_ELEMENT_ERROR), + ("42P19", SqlState::INVALID_RECURSION), + ("42611", SqlState::INVALID_COLUMN_DEFINITION), + ("42P13", SqlState::INVALID_FUNCTION_DEFINITION), + ("25003", SqlState::INAPPROPRIATE_ACCESS_MODE_FOR_BRANCH_TRANSACTION), + ("39P02", SqlState::E_R_I_E_SRF_PROTOCOL_VIOLATED), + ("XX000", SqlState::INTERNAL_ERROR), + ("08006", SqlState::CONNECTION_FAILURE), + ("57P04", SqlState::DATABASE_DROPPED), + ("42P07", SqlState::DUPLICATE_TABLE), + ("22P03", 
SqlState::INVALID_BINARY_REPRESENTATION), + ("22035", SqlState::NO_SQL_JSON_ITEM), + ("42P14", SqlState::INVALID_PSTATEMENT_DEFINITION), + ("01007", SqlState::WARNING_PRIVILEGE_NOT_GRANTED), + ("38004", SqlState::E_R_E_READING_SQL_DATA_NOT_PERMITTED), + ("42P21", SqlState::COLLATION_MISMATCH), + ("0Z000", SqlState::DIAGNOSTICS_EXCEPTION), + ("HV001", SqlState::FDW_OUT_OF_MEMORY), + ("0F000", SqlState::LOCATOR_EXCEPTION), + ("22013", SqlState::INVALID_PRECEDING_OR_FOLLOWING_SIZE), + ("2201E", SqlState::INVALID_ARGUMENT_FOR_LOG), + ("22011", SqlState::SUBSTRING_ERROR), + ("42602", SqlState::INVALID_NAME), + ("01004", SqlState::WARNING_STRING_DATA_RIGHT_TRUNCATION), + ("42P02", SqlState::UNDEFINED_PARAMETER), + ("2203C", SqlState::SQL_JSON_OBJECT_NOT_FOUND), + ("HV002", SqlState::FDW_DYNAMIC_PARAMETER_VALUE_NEEDED), + ("0F001", SqlState::L_E_INVALID_SPECIFICATION), + ("58P01", SqlState::UNDEFINED_FILE), + ("38001", SqlState::E_R_E_CONTAINING_SQL_NOT_PERMITTED), + ("42703", SqlState::UNDEFINED_COLUMN), + ("57P05", SqlState::IDLE_SESSION_TIMEOUT), + ("57P03", SqlState::CANNOT_CONNECT_NOW), + ("HV007", SqlState::FDW_INVALID_COLUMN_NAME), + ("22014", SqlState::INVALID_ARGUMENT_FOR_NTILE), + ("22P06", SqlState::NONSTANDARD_USE_OF_ESCAPE_CHARACTER), + ("2203F", SqlState::SQL_JSON_SCALAR_REQUIRED), + ("2200F", SqlState::ZERO_LENGTH_CHARACTER_STRING), + ("09000", SqlState::TRIGGERED_ACTION_EXCEPTION), + ("2201F", SqlState::INVALID_ARGUMENT_FOR_POWER_FUNCTION), + ("08003", SqlState::CONNECTION_DOES_NOT_EXIST), + ("38002", SqlState::E_R_E_MODIFYING_SQL_DATA_NOT_PERMITTED), + ("F0001", SqlState::LOCK_FILE_EXISTS), + ("42P22", SqlState::INDETERMINATE_COLLATION), + ("2200C", SqlState::INVALID_USE_OF_ESCAPE_CHARACTER), + ("2203E", SqlState::TOO_MANY_JSON_OBJECT_MEMBERS), + ("23514", SqlState::CHECK_VIOLATION), + ("22P02", SqlState::INVALID_TEXT_REPRESENTATION), + ("54023", SqlState::TOO_MANY_ARGUMENTS), + ("2200T", SqlState::INVALID_XML_PROCESSING_INSTRUCTION), + ("22016", SqlState::INVALID_ARGUMENT_FOR_NTH_VALUE), + ("25P03", SqlState::IDLE_IN_TRANSACTION_SESSION_TIMEOUT), + ("3B001", SqlState::S_E_INVALID_SPECIFICATION), + ("08001", SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + ("22036", SqlState::NON_NUMERIC_SQL_JSON_ITEM), + ("3F000", SqlState::INVALID_SCHEMA_NAME), + ("39P01", SqlState::E_R_I_E_TRIGGER_PROTOCOL_VIOLATED), + ("22026", SqlState::STRING_DATA_LENGTH_MISMATCH), + ("42P17", SqlState::INVALID_OBJECT_DEFINITION), + ("22034", SqlState::MORE_THAN_ONE_SQL_JSON_ITEM), + ("HV000", SqlState::FDW_ERROR), + ("2200B", SqlState::ESCAPE_CHARACTER_CONFLICT), + ("HV008", SqlState::FDW_INVALID_COLUMN_NUMBER), + ("34000", SqlState::INVALID_CURSOR_NAME), + ("2201G", SqlState::INVALID_ARGUMENT_FOR_WIDTH_BUCKET_FUNCTION), + ("44000", SqlState::WITH_CHECK_OPTION_VIOLATION), + ("HV010", SqlState::FDW_FUNCTION_SEQUENCE_ERROR), + ("39004", SqlState::E_R_I_E_NULL_VALUE_NOT_ALLOWED), + ("22001", SqlState::STRING_DATA_RIGHT_TRUNCATION), + ("3D000", SqlState::INVALID_CATALOG_NAME), + ("25005", SqlState::NO_ACTIVE_SQL_TRANSACTION_FOR_BRANCH_TRANSACTION), + ("2200L", SqlState::NOT_AN_XML_DOCUMENT), + ("27000", SqlState::TRIGGERED_DATA_CHANGE_VIOLATION), + ("HV090", SqlState::FDW_INVALID_STRING_LENGTH_OR_BUFFER_LENGTH), + ("42939", SqlState::RESERVED_NAME), + ("58000", SqlState::SYSTEM_ERROR), + ("2200M", SqlState::INVALID_XML_DOCUMENT), + ("HV00L", SqlState::FDW_UNABLE_TO_CREATE_EXECUTION), + ("57014", SqlState::QUERY_CANCELED), + ("23502", SqlState::NOT_NULL_VIOLATION), + ("22002", 
SqlState::NULL_VALUE_NO_INDICATOR_PARAMETER), + ("HV00R", SqlState::FDW_TABLE_NOT_FOUND), + ("HV00P", SqlState::FDW_NO_SCHEMAS), + ("38003", SqlState::E_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + ("39000", SqlState::EXTERNAL_ROUTINE_INVOCATION_EXCEPTION), + ("22015", SqlState::INTERVAL_FIELD_OVERFLOW), + ("HV00K", SqlState::FDW_REPLY_HANDLE), + ("HV024", SqlState::FDW_INVALID_ATTRIBUTE_VALUE), + ("2200D", SqlState::INVALID_ESCAPE_OCTET), + ("08007", SqlState::TRANSACTION_RESOLUTION_UNKNOWN), + ("2F003", SqlState::S_R_E_PROHIBITED_SQL_STATEMENT_ATTEMPTED), + ("42725", SqlState::AMBIGUOUS_FUNCTION), + ("2203A", SqlState::SQL_JSON_MEMBER_NOT_FOUND), + ("42846", SqlState::CANNOT_COERCE), + ("42P04", SqlState::DUPLICATE_DATABASE), + ("42000", SqlState::SYNTAX_ERROR_OR_ACCESS_RULE_VIOLATION), + ("2203B", SqlState::SQL_JSON_NUMBER_NOT_FOUND), + ("42P05", SqlState::DUPLICATE_PSTATEMENT), + ("53300", SqlState::TOO_MANY_CONNECTIONS), + ("53400", SqlState::CONFIGURATION_LIMIT_EXCEEDED), + ("42704", SqlState::UNDEFINED_OBJECT), + ("2202G", SqlState::INVALID_TABLESAMPLE_REPEAT), + ("22023", SqlState::INVALID_PARAMETER_VALUE), + ("53000", SqlState::INSUFFICIENT_RESOURCES), + ], +}; diff --git a/libs/proxy/tokio-postgres2/src/generic_client.rs b/libs/proxy/tokio-postgres2/src/generic_client.rs new file mode 100644 index 0000000000..768213f8ed --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/generic_client.rs @@ -0,0 +1,64 @@ +use crate::query::RowStream; +use crate::types::Type; +use crate::{Client, Error, Transaction}; +use async_trait::async_trait; +use postgres_protocol2::Oid; + +mod private { + pub trait Sealed {} +} + +/// A trait allowing abstraction over connections and transactions. +/// +/// This trait is "sealed", and cannot be implemented outside of this crate. +#[async_trait] +pub trait GenericClient: private::Sealed { + /// Like `Client::query_raw_txt`. + async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef + Sync + Send, + I: IntoIterator> + Sync + Send, + I::IntoIter: ExactSizeIterator + Sync + Send; + + /// Query for type information + async fn get_type(&self, oid: Oid) -> Result; +} + +impl private::Sealed for Client {} + +#[async_trait] +impl GenericClient for Client { + async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef + Sync + Send, + I: IntoIterator> + Sync + Send, + I::IntoIter: ExactSizeIterator + Sync + Send, + { + self.query_raw_txt(statement, params).await + } + + /// Query for type information + async fn get_type(&self, oid: Oid) -> Result { + self.get_type(oid).await + } +} + +impl private::Sealed for Transaction<'_> {} + +#[async_trait] +#[allow(clippy::needless_lifetimes)] +impl GenericClient for Transaction<'_> { + async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef + Sync + Send, + I: IntoIterator> + Sync + Send, + I::IntoIter: ExactSizeIterator + Sync + Send, + { + self.query_raw_txt(statement, params).await + } + + /// Query for type information + async fn get_type(&self, oid: Oid) -> Result { + self.client().get_type(oid).await + } +} diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs new file mode 100644 index 0000000000..72ba8172b2 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -0,0 +1,148 @@ +//! An asynchronous, pipelined, PostgreSQL client. 
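The `GenericClient` trait above is what lets downstream code run the same query path against either a `Client` or a `Transaction`. A minimal sketch of that usage, assuming the crate is consumed under the `tokio_postgres` alias that proxy/Cargo.toml sets up later in this patch; the query text is a placeholder:

    use tokio_postgres::{Error, GenericClient, RowStream};

    // Compiles for both implementors above: `Client` and `Transaction<'_>`.
    async fn ping<C: GenericClient>(conn: &C) -> Result<RowStream, Error> {
        // No parameters: an empty Vec still satisfies the iterator bounds.
        conn.query_raw_txt("SELECT 1", Vec::<Option<String>>::new()).await
    }
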
+#![warn(rust_2018_idioms, clippy::all, missing_docs)] + +pub use crate::cancel_token::CancelToken; +pub use crate::client::Client; +pub use crate::config::Config; +pub use crate::connection::Connection; +use crate::error::DbError; +pub use crate::error::Error; +pub use crate::generic_client::GenericClient; +pub use crate::query::RowStream; +pub use crate::row::{Row, SimpleQueryRow}; +pub use crate::simple_query::SimpleQueryStream; +pub use crate::statement::{Column, Statement}; +use crate::tls::MakeTlsConnect; +pub use crate::tls::NoTls; +pub use crate::to_statement::ToStatement; +pub use crate::transaction::Transaction; +pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; +use crate::types::ToSql; +use postgres_protocol2::message::backend::ReadyForQueryBody; +use tokio::net::TcpStream; + +/// After executing a query, the connection will be in one of these states +#[derive(Clone, Copy, Debug, PartialEq)] +#[repr(u8)] +pub enum ReadyForQueryStatus { + /// Connection state is unknown + Unknown, + /// Connection is idle (no transactions) + Idle = b'I', + /// Connection is in a transaction block + Transaction = b'T', + /// Connection is in a failed transaction block + FailedTransaction = b'E', +} + +impl From for ReadyForQueryStatus { + fn from(value: ReadyForQueryBody) -> Self { + match value.status() { + b'I' => Self::Idle, + b'T' => Self::Transaction, + b'E' => Self::FailedTransaction, + _ => Self::Unknown, + } + } +} + +mod cancel_query; +mod cancel_query_raw; +mod cancel_token; +mod client; +mod codec; +pub mod config; +mod connect; +mod connect_raw; +mod connect_socket; +mod connect_tls; +mod connection; +pub mod error; +mod generic_client; +pub mod maybe_tls_stream; +mod prepare; +mod query; +pub mod row; +mod simple_query; +mod statement; +pub mod tls; +mod to_statement; +mod transaction; +mod transaction_builder; +pub mod types; + +/// A convenience function which parses a connection string and connects to the database. +/// +/// See the documentation for [`Config`] for details on the connection string format. +/// +/// Requires the `runtime` Cargo feature (enabled by default). +/// +/// [`Config`]: config/struct.Config.html +pub async fn connect( + config: &str, + tls: T, +) -> Result<(Client, Connection), Error> +where + T: MakeTlsConnect, +{ + let config = config.parse::()?; + config.connect(tls).await +} + +/// An asynchronous notification. +#[derive(Clone, Debug)] +pub struct Notification { + process_id: i32, + channel: String, + payload: String, +} + +impl Notification { + /// The process ID of the notifying backend process. + pub fn process_id(&self) -> i32 { + self.process_id + } + + /// The name of the channel that the notify has been raised on. + pub fn channel(&self) -> &str { + &self.channel + } + + /// The "payload" string passed from the notifying process. + pub fn payload(&self) -> &str { + &self.payload + } +} + +/// An asynchronous message from the server. +#[allow(clippy::large_enum_variant)] +#[derive(Debug, Clone)] +#[non_exhaustive] +pub enum AsyncMessage { + /// A notice. + /// + /// Notices use the same format as errors, but aren't "errors" per-se. + Notice(DbError), + /// A notification. + /// + /// Connections can subscribe to notifications with the `LISTEN` command. + Notification(Notification), +} + +/// Message returned by the `SimpleQuery` stream. +#[derive(Debug)] +#[non_exhaustive] +pub enum SimpleQueryMessage { + /// A row of data. + Row(SimpleQueryRow), + /// A statement in the query has completed. 
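A short sketch of the `connect` entry point above with the trivial `NoTls` connector; the DSN is a placeholder, and the `Connection` half is assumed to still implement `Future` as it does in upstream tokio-postgres:

    use tokio_postgres::{Client, Error, NoTls};

    async fn open() -> Result<Client, Error> {
        let (client, connection) =
            tokio_postgres::connect("host=localhost user=postgres", NoTls).await?;
        // The connection owns the socket and must be driven to completion.
        tokio::spawn(async move {
            if let Err(e) = connection.await {
                eprintln!("connection error: {e}");
            }
        });
        Ok(client)
    }
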
+ /// + /// The number of rows modified or selected is returned. + CommandComplete(u64), +} + +fn slice_iter<'a>( + s: &'a [&'a (dyn ToSql + Sync)], +) -> impl ExactSizeIterator + 'a { + s.iter().map(|s| *s as _) +} diff --git a/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs new file mode 100644 index 0000000000..9a7e248997 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/maybe_tls_stream.rs @@ -0,0 +1,77 @@ +//! MaybeTlsStream. +//! +//! Represents a stream that may or may not be encrypted with TLS. +use crate::tls::{ChannelBinding, TlsStream}; +use std::io; +use std::pin::Pin; +use std::task::{Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +/// A stream that may or may not be encrypted with TLS. +pub enum MaybeTlsStream { + /// An unencrypted stream. + Raw(S), + /// An encrypted stream. + Tls(T), +} + +impl AsyncRead for MaybeTlsStream +where + S: AsyncRead + Unpin, + T: AsyncRead + Unpin, +{ + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_read(cx, buf), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_read(cx, buf), + } + } +} + +impl AsyncWrite for MaybeTlsStream +where + S: AsyncWrite + Unpin, + T: AsyncWrite + Unpin, +{ + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_write(cx, buf), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_write(cx, buf), + } + } + + fn poll_flush(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_flush(cx), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_flush(cx), + } + } + + fn poll_shutdown(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match &mut *self { + MaybeTlsStream::Raw(s) => Pin::new(s).poll_shutdown(cx), + MaybeTlsStream::Tls(s) => Pin::new(s).poll_shutdown(cx), + } + } +} + +impl TlsStream for MaybeTlsStream +where + S: AsyncRead + AsyncWrite + Unpin, + T: TlsStream + Unpin, +{ + fn channel_binding(&self) -> ChannelBinding { + match self { + MaybeTlsStream::Raw(_) => ChannelBinding::none(), + MaybeTlsStream::Tls(s) => s.channel_binding(), + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/prepare.rs b/libs/proxy/tokio-postgres2/src/prepare.rs new file mode 100644 index 0000000000..da0c755c5b --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/prepare.rs @@ -0,0 +1,262 @@ +use crate::client::InnerClient; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::error::SqlState; +use crate::types::{Field, Kind, Oid, Type}; +use crate::{query, slice_iter}; +use crate::{Column, Error, Statement}; +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{pin_mut, TryStreamExt}; +use log::debug; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +pub(crate) const TYPEINFO_QUERY: &str = "\ +SELECT t.typname, t.typtype, t.typelem, r.rngsubtype, t.typbasetype, n.nspname, t.typrelid +FROM pg_catalog.pg_type t +LEFT OUTER JOIN pg_catalog.pg_range r ON r.rngtypid = t.oid +INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid +WHERE t.oid = $1 +"; + +// Range types weren't added until Postgres 9.2, so pg_range may not exist +const 
TYPEINFO_FALLBACK_QUERY: &str = "\ +SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid +FROM pg_catalog.pg_type t +INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid +WHERE t.oid = $1 +"; + +const TYPEINFO_ENUM_QUERY: &str = "\ +SELECT enumlabel +FROM pg_catalog.pg_enum +WHERE enumtypid = $1 +ORDER BY enumsortorder +"; + +// Postgres 9.0 didn't have enumsortorder +const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\ +SELECT enumlabel +FROM pg_catalog.pg_enum +WHERE enumtypid = $1 +ORDER BY oid +"; + +pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\ +SELECT attname, atttypid +FROM pg_catalog.pg_attribute +WHERE attrelid = $1 +AND NOT attisdropped +AND attnum > 0 +ORDER BY attnum +"; + +static NEXT_ID: AtomicUsize = AtomicUsize::new(0); + +pub async fn prepare( + client: &Arc, + query: &str, + types: &[Type], +) -> Result { + let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst)); + let buf = encode(client, &name, query, types)?; + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + match responses.next().await? { + Message::ParseComplete => {} + _ => return Err(Error::unexpected_message()), + } + + let parameter_description = match responses.next().await? { + Message::ParameterDescription(body) => body, + _ => return Err(Error::unexpected_message()), + }; + + let row_description = match responses.next().await? { + Message::RowDescription(body) => Some(body), + Message::NoData => None, + _ => return Err(Error::unexpected_message()), + }; + + let mut parameters = vec![]; + let mut it = parameter_description.parameters(); + while let Some(oid) = it.next().map_err(Error::parse)? { + let type_ = get_type(client, oid).await?; + parameters.push(type_); + } + + let mut columns = vec![]; + if let Some(row_description) = row_description { + let mut it = row_description.fields(); + while let Some(field) = it.next().map_err(Error::parse)? { + let type_ = get_type(client, field.type_oid()).await?; + let column = Column::new(field.name().to_string(), type_, field); + columns.push(column); + } + } + + Ok(Statement::new(client, name, parameters, columns)) +} + +fn prepare_rec<'a>( + client: &'a Arc, + query: &'a str, + types: &'a [Type], +) -> Pin> + 'a + Send>> { + Box::pin(prepare(client, query, types)) +} + +fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result { + if types.is_empty() { + debug!("preparing query {}: {}", name, query); + } else { + debug!("preparing query {} with types {:?}: {}", name, types, query); + } + + client.with_buf(|buf| { + frontend::parse(name, query, types.iter().map(Type::oid), buf).map_err(Error::encode)?; + frontend::describe(b'S', name, buf).map_err(Error::encode)?; + frontend::sync(buf); + Ok(buf.split().freeze()) + }) +} + +pub async fn get_type(client: &Arc, oid: Oid) -> Result { + if let Some(type_) = Type::from_oid(oid) { + return Ok(type_); + } + + if let Some(type_) = client.type_(oid) { + return Ok(type_); + } + + let stmt = typeinfo_statement(client).await?; + + let rows = query::query(client, stmt, slice_iter(&[&oid])).await?; + pin_mut!(rows); + + let row = match rows.try_next().await? 
{ + Some(row) => row, + None => return Err(Error::unexpected_message()), + }; + + let name: String = row.try_get(0)?; + let type_: i8 = row.try_get(1)?; + let elem_oid: Oid = row.try_get(2)?; + let rngsubtype: Option = row.try_get(3)?; + let basetype: Oid = row.try_get(4)?; + let schema: String = row.try_get(5)?; + let relid: Oid = row.try_get(6)?; + + let kind = if type_ == b'e' as i8 { + let variants = get_enum_variants(client, oid).await?; + Kind::Enum(variants) + } else if type_ == b'p' as i8 { + Kind::Pseudo + } else if basetype != 0 { + let type_ = get_type_rec(client, basetype).await?; + Kind::Domain(type_) + } else if elem_oid != 0 { + let type_ = get_type_rec(client, elem_oid).await?; + Kind::Array(type_) + } else if relid != 0 { + let fields = get_composite_fields(client, relid).await?; + Kind::Composite(fields) + } else if let Some(rngsubtype) = rngsubtype { + let type_ = get_type_rec(client, rngsubtype).await?; + Kind::Range(type_) + } else { + Kind::Simple + }; + + let type_ = Type::new(name, oid, kind, schema); + client.set_type(oid, &type_); + + Ok(type_) +} + +fn get_type_rec<'a>( + client: &'a Arc, + oid: Oid, +) -> Pin> + Send + 'a>> { + Box::pin(get_type(client, oid)) +} + +async fn typeinfo_statement(client: &Arc) -> Result { + if let Some(stmt) = client.typeinfo() { + return Ok(stmt); + } + + let stmt = match prepare_rec(client, TYPEINFO_QUERY, &[]).await { + Ok(stmt) => stmt, + Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => { + prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await? + } + Err(e) => return Err(e), + }; + + client.set_typeinfo(&stmt); + Ok(stmt) +} + +async fn get_enum_variants(client: &Arc, oid: Oid) -> Result, Error> { + let stmt = typeinfo_enum_statement(client).await?; + + query::query(client, stmt, slice_iter(&[&oid])) + .await? + .and_then(|row| async move { row.try_get(0) }) + .try_collect() + .await +} + +async fn typeinfo_enum_statement(client: &Arc) -> Result { + if let Some(stmt) = client.typeinfo_enum() { + return Ok(stmt); + } + + let stmt = match prepare_rec(client, TYPEINFO_ENUM_QUERY, &[]).await { + Ok(stmt) => stmt, + Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => { + prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await? + } + Err(e) => return Err(e), + }; + + client.set_typeinfo_enum(&stmt); + Ok(stmt) +} + +async fn get_composite_fields(client: &Arc, oid: Oid) -> Result, Error> { + let stmt = typeinfo_composite_statement(client).await?; + + let rows = query::query(client, stmt, slice_iter(&[&oid])) + .await? 
+ .try_collect::>() + .await?; + + let mut fields = vec![]; + for row in rows { + let name = row.try_get(0)?; + let oid = row.try_get(1)?; + let type_ = get_type_rec(client, oid).await?; + fields.push(Field::new(name, type_)); + } + + Ok(fields) +} + +async fn typeinfo_composite_statement(client: &Arc) -> Result { + if let Some(stmt) = client.typeinfo_composite() { + return Ok(stmt); + } + + let stmt = prepare_rec(client, TYPEINFO_COMPOSITE_QUERY, &[]).await?; + + client.set_typeinfo_composite(&stmt); + Ok(stmt) +} diff --git a/libs/proxy/tokio-postgres2/src/query.rs b/libs/proxy/tokio-postgres2/src/query.rs new file mode 100644 index 0000000000..534195a707 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/query.rs @@ -0,0 +1,340 @@ +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::types::IsNull; +use crate::{Column, Error, ReadyForQueryStatus, Row, Statement}; +use bytes::{BufMut, Bytes, BytesMut}; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Stream}; +use log::{debug, log_enabled, Level}; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use postgres_types2::{Format, ToSql, Type}; +use std::fmt; +use std::marker::PhantomPinned; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +struct BorrowToSqlParamsDebug<'a>(&'a [&'a (dyn ToSql + Sync)]); + +impl fmt::Debug for BorrowToSqlParamsDebug<'_> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list().entries(self.0.iter()).finish() + } +} + +pub async fn query<'a, I>( + client: &InnerClient, + statement: Statement, + params: I, +) -> Result +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + let buf = if log_enabled!(Level::Debug) { + let params = params.into_iter().collect::>(); + debug!( + "executing statement {} with parameters: {:?}", + statement.name(), + BorrowToSqlParamsDebug(params.as_slice()), + ); + encode(client, &statement, params)? + } else { + encode(client, &statement, params)? 
+ }; + let responses = start(client, buf).await?; + Ok(RowStream { + statement, + responses, + command_tag: None, + status: ReadyForQueryStatus::Unknown, + output_format: Format::Binary, + _p: PhantomPinned, + }) +} + +pub async fn query_txt( + client: &Arc, + query: &str, + params: I, +) -> Result +where + S: AsRef, + I: IntoIterator>, + I::IntoIter: ExactSizeIterator, +{ + let params = params.into_iter(); + + let buf = client.with_buf(|buf| { + frontend::parse( + "", // unnamed prepared statement + query, // query to parse + std::iter::empty(), // give no type info + buf, + ) + .map_err(Error::encode)?; + frontend::describe(b'S', "", buf).map_err(Error::encode)?; + // Bind, pass params as text, retrieve as binary + match frontend::bind( + "", // empty string selects the unnamed portal + "", // unnamed prepared statement + std::iter::empty(), // all parameters use the default format (text) + params, + |param, buf| match param { + Some(param) => { + buf.put_slice(param.as_ref().as_bytes()); + Ok(postgres_protocol2::IsNull::No) + } + None => Ok(postgres_protocol2::IsNull::Yes), + }, + Some(0), // all text + buf, + ) { + Ok(()) => Ok(()), + Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, 0)), + Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)), + }?; + + // Execute + frontend::execute("", 0, buf).map_err(Error::encode)?; + // Sync + frontend::sync(buf); + + Ok(buf.split().freeze()) + })?; + + // now read the responses + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + match responses.next().await? { + Message::ParseComplete => {} + _ => return Err(Error::unexpected_message()), + } + + let parameter_description = match responses.next().await? { + Message::ParameterDescription(body) => body, + _ => return Err(Error::unexpected_message()), + }; + + let row_description = match responses.next().await? { + Message::RowDescription(body) => Some(body), + Message::NoData => None, + _ => return Err(Error::unexpected_message()), + }; + + match responses.next().await? { + Message::BindComplete => {} + _ => return Err(Error::unexpected_message()), + } + + let mut parameters = vec![]; + let mut it = parameter_description.parameters(); + while let Some(oid) = it.next().map_err(Error::parse)? { + let type_ = Type::from_oid(oid).unwrap_or(Type::UNKNOWN); + parameters.push(type_); + } + + let mut columns = vec![]; + if let Some(row_description) = row_description { + let mut it = row_description.fields(); + while let Some(field) = it.next().map_err(Error::parse)? { + let type_ = Type::from_oid(field.type_oid()).unwrap_or(Type::UNKNOWN); + let column = Column::new(field.name().to_string(), type_, field); + columns.push(column); + } + } + + Ok(RowStream { + statement: Statement::new_anonymous(parameters, columns), + responses, + command_tag: None, + status: ReadyForQueryStatus::Unknown, + output_format: Format::Text, + _p: PhantomPinned, + }) +} + +pub async fn execute<'a, I>( + client: &InnerClient, + statement: Statement, + params: I, +) -> Result +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + let buf = if log_enabled!(Level::Debug) { + let params = params.into_iter().collect::>(); + debug!( + "executing statement {} with parameters: {:?}", + statement.name(), + BorrowToSqlParamsDebug(params.as_slice()), + ); + encode(client, &statement, params)? + } else { + encode(client, &statement, params)? + }; + let mut responses = start(client, buf).await?; + + let mut rows = 0; + loop { + match responses.next().await? 
{ + Message::DataRow(_) => {} + Message::CommandComplete(body) => { + rows = body + .tag() + .map_err(Error::parse)? + .rsplit(' ') + .next() + .unwrap() + .parse() + .unwrap_or(0); + } + Message::EmptyQueryResponse => rows = 0, + Message::ReadyForQuery(_) => return Ok(rows), + _ => return Err(Error::unexpected_message()), + } + } +} + +async fn start(client: &InnerClient, buf: Bytes) -> Result { + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + match responses.next().await? { + Message::BindComplete => {} + _ => return Err(Error::unexpected_message()), + } + + Ok(responses) +} + +pub fn encode<'a, I>(client: &InnerClient, statement: &Statement, params: I) -> Result +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + client.with_buf(|buf| { + encode_bind(statement, params, "", buf)?; + frontend::execute("", 0, buf).map_err(Error::encode)?; + frontend::sync(buf); + Ok(buf.split().freeze()) + }) +} + +pub fn encode_bind<'a, I>( + statement: &Statement, + params: I, + portal: &str, + buf: &mut BytesMut, +) -> Result<(), Error> +where + I: IntoIterator, + I::IntoIter: ExactSizeIterator, +{ + let param_types = statement.params(); + let params = params.into_iter(); + + assert!( + param_types.len() == params.len(), + "expected {} parameters but got {}", + param_types.len(), + params.len() + ); + + let (param_formats, params): (Vec<_>, Vec<_>) = params + .zip(param_types.iter()) + .map(|(p, ty)| (p.encode_format(ty) as i16, p)) + .unzip(); + + let params = params.into_iter(); + + let mut error_idx = 0; + let r = frontend::bind( + portal, + statement.name(), + param_formats, + params.zip(param_types).enumerate(), + |(idx, (param, ty)), buf| match param.to_sql_checked(ty, buf) { + Ok(IsNull::No) => Ok(postgres_protocol2::IsNull::No), + Ok(IsNull::Yes) => Ok(postgres_protocol2::IsNull::Yes), + Err(e) => { + error_idx = idx; + Err(e) + } + }, + Some(1), + buf, + ); + match r { + Ok(()) => Ok(()), + Err(frontend::BindError::Conversion(e)) => Err(Error::to_sql(e, error_idx)), + Err(frontend::BindError::Serialization(e)) => Err(Error::encode(e)), + } +} + +pin_project! { + /// A stream of table rows. + pub struct RowStream { + statement: Statement, + responses: Responses, + command_tag: Option, + output_format: Format, + status: ReadyForQueryStatus, + #[pin] + _p: PhantomPinned, + } +} + +impl Stream for RowStream { + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + loop { + match ready!(this.responses.poll_next(cx)?) { + Message::DataRow(body) => { + return Poll::Ready(Some(Ok(Row::new( + this.statement.clone(), + body, + *this.output_format, + )?))) + } + Message::EmptyQueryResponse | Message::PortalSuspended => {} + Message::CommandComplete(body) => { + if let Ok(tag) = body.tag() { + *this.command_tag = Some(tag.to_string()); + } + } + Message::ReadyForQuery(status) => { + *this.status = status.into(); + return Poll::Ready(None); + } + _ => return Poll::Ready(Some(Err(Error::unexpected_message()))), + } + } + } +} + +impl RowStream { + /// Returns information about the columns of data in the row. + pub fn columns(&self) -> &[Column] { + self.statement.columns() + } + + /// Returns the command tag of this query. + /// + /// This is only available after the stream has been exhausted. + pub fn command_tag(&self) -> Option { + self.command_tag.clone() + } + + /// Returns if the connection is ready for querying, with the status of the connection. 
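A sketch of consuming the `RowStream` built above via the text-mode path, using the `as_text` accessor added in row.rs further down; the table, column, and parameter are illustrative, and `futures_util` is assumed to be available to the caller as it is inside this crate:

    use futures_util::{pin_mut, TryStreamExt};
    use tokio_postgres::{Client, Error, GenericClient};

    async fn item_names(client: &Client) -> Result<Vec<String>, Error> {
        let rows = client
            .query_raw_txt("SELECT name FROM items WHERE id > $1", [Some("10")])
            .await?;
        pin_mut!(rows); // RowStream is !Unpin, so pin it before polling.
        let mut names = Vec::new();
        while let Some(row) = rows.try_next().await? {
            // Text transfer mode: read the column back as a string slice.
            if let Some(name) = row.as_text(0)? {
                names.push(name.to_owned());
            }
        }
        // The command tag ("SELECT n") is only populated once the stream is drained.
        let _tag = rows.command_tag();
        Ok(names)
    }
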
+ /// + /// This might be available only after the stream has been exhausted. + pub fn ready_status(&self) -> ReadyForQueryStatus { + self.status + } +} diff --git a/libs/proxy/tokio-postgres2/src/row.rs b/libs/proxy/tokio-postgres2/src/row.rs new file mode 100644 index 0000000000..10e130707d --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/row.rs @@ -0,0 +1,300 @@ +//! Rows. + +use crate::row::sealed::{AsName, Sealed}; +use crate::simple_query::SimpleColumn; +use crate::statement::Column; +use crate::types::{FromSql, Type, WrongType}; +use crate::{Error, Statement}; +use fallible_iterator::FallibleIterator; +use postgres_protocol2::message::backend::DataRowBody; +use postgres_types2::{Format, WrongFormat}; +use std::fmt; +use std::ops::Range; +use std::str; +use std::sync::Arc; + +mod sealed { + pub trait Sealed {} + + pub trait AsName { + fn as_name(&self) -> &str; + } +} + +impl AsName for Column { + fn as_name(&self) -> &str { + self.name() + } +} + +impl AsName for String { + fn as_name(&self) -> &str { + self + } +} + +/// A trait implemented by types that can index into columns of a row. +/// +/// This cannot be implemented outside of this crate. +pub trait RowIndex: Sealed { + #[doc(hidden)] + fn __idx(&self, columns: &[T]) -> Option + where + T: AsName; +} + +impl Sealed for usize {} + +impl RowIndex for usize { + #[inline] + fn __idx(&self, columns: &[T]) -> Option + where + T: AsName, + { + if *self >= columns.len() { + None + } else { + Some(*self) + } + } +} + +impl Sealed for str {} + +impl RowIndex for str { + #[inline] + fn __idx(&self, columns: &[T]) -> Option + where + T: AsName, + { + if let Some(idx) = columns.iter().position(|d| d.as_name() == self) { + return Some(idx); + }; + + // FIXME ASCII-only case insensitivity isn't really the right thing to + // do. Postgres itself uses a dubious wrapper around tolower and JDBC + // uses the US locale. + columns + .iter() + .position(|d| d.as_name().eq_ignore_ascii_case(self)) + } +} + +impl Sealed for &T where T: ?Sized + Sealed {} + +impl RowIndex for &T +where + T: ?Sized + RowIndex, +{ + #[inline] + fn __idx(&self, columns: &[U]) -> Option + where + U: AsName, + { + T::__idx(*self, columns) + } +} + +/// A row of data returned from the database by a query. +pub struct Row { + statement: Statement, + output_format: Format, + body: DataRowBody, + ranges: Vec>>, +} + +impl fmt::Debug for Row { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Row") + .field("columns", &self.columns()) + .finish() + } +} + +impl Row { + pub(crate) fn new( + statement: Statement, + body: DataRowBody, + output_format: Format, + ) -> Result { + let ranges = body.ranges().collect().map_err(Error::parse)?; + Ok(Row { + statement, + body, + ranges, + output_format, + }) + } + + /// Returns information about the columns of data in the row. + pub fn columns(&self) -> &[Column] { + self.statement.columns() + } + + /// Determines if the row contains no values. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of values in the row. + pub fn len(&self) -> usize { + self.columns().len() + } + + /// Deserializes a value from the row. + /// + /// The value can be specified either by its numeric index in the row, or by its column name. + /// + /// # Panics + /// + /// Panics if the index is out of bounds or if the value cannot be converted to the specified type. 
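The accessor documented above and its fallible counterpart just below differ only in how failure is reported; a small sketch with illustrative column names, assuming a binary-format row (a prepared-statement path rather than `query_raw_txt`):

    use tokio_postgres::Row;

    fn read(row: &Row) {
        // Panics if "id" is absent or not an int4.
        let id: i32 = row.get("id");
        // Same lookup, but the mismatch comes back as an `Error` instead.
        let name: Result<String, _> = row.try_get("name");
        println!("{id} {name:?}");
    }
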
+ pub fn get<'a, I, T>(&'a self, idx: I) -> T + where + I: RowIndex + fmt::Display, + T: FromSql<'a>, + { + match self.get_inner(&idx) { + Ok(ok) => ok, + Err(err) => panic!("error retrieving column {}: {}", idx, err), + } + } + + /// Like `Row::get`, but returns a `Result` rather than panicking. + pub fn try_get<'a, I, T>(&'a self, idx: I) -> Result + where + I: RowIndex + fmt::Display, + T: FromSql<'a>, + { + self.get_inner(&idx) + } + + fn get_inner<'a, I, T>(&'a self, idx: &I) -> Result + where + I: RowIndex + fmt::Display, + T: FromSql<'a>, + { + let idx = match idx.__idx(self.columns()) { + Some(idx) => idx, + None => return Err(Error::column(idx.to_string())), + }; + + let ty = self.columns()[idx].type_(); + if !T::accepts(ty) { + return Err(Error::from_sql( + Box::new(WrongType::new::(ty.clone())), + idx, + )); + } + + FromSql::from_sql_nullable(ty, self.col_buffer(idx)).map_err(|e| Error::from_sql(e, idx)) + } + + /// Get the raw bytes for the column at the given index. + fn col_buffer(&self, idx: usize) -> Option<&[u8]> { + let range = self.ranges.get(idx)?.to_owned()?; + Some(&self.body.buffer()[range]) + } + + /// Interpret the column at the given index as text + /// + /// Useful when using query_raw_txt() which sets text transfer mode + pub fn as_text(&self, idx: usize) -> Result, Error> { + if self.output_format == Format::Text { + match self.col_buffer(idx) { + Some(raw) => { + FromSql::from_sql(&Type::TEXT, raw).map_err(|e| Error::from_sql(e, idx)) + } + None => Ok(None), + } + } else { + Err(Error::from_sql(Box::new(WrongFormat {}), idx)) + } + } + + /// Row byte size + pub fn body_len(&self) -> usize { + self.body.buffer().len() + } +} + +impl AsName for SimpleColumn { + fn as_name(&self) -> &str { + self.name() + } +} + +/// A row of data returned from the database by a simple query. +#[derive(Debug)] +pub struct SimpleQueryRow { + columns: Arc<[SimpleColumn]>, + body: DataRowBody, + ranges: Vec>>, +} + +impl SimpleQueryRow { + #[allow(clippy::new_ret_no_self)] + pub(crate) fn new( + columns: Arc<[SimpleColumn]>, + body: DataRowBody, + ) -> Result { + let ranges = body.ranges().collect().map_err(Error::parse)?; + Ok(SimpleQueryRow { + columns, + body, + ranges, + }) + } + + /// Returns information about the columns of data in the row. + pub fn columns(&self) -> &[SimpleColumn] { + &self.columns + } + + /// Determines if the row contains no values. + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Returns the number of values in the row. + pub fn len(&self) -> usize { + self.columns.len() + } + + /// Returns a value from the row. + /// + /// The value can be specified either by its numeric index in the row, or by its column name. + /// + /// # Panics + /// + /// Panics if the index is out of bounds or if the value cannot be converted to the specified type. + pub fn get(&self, idx: I) -> Option<&str> + where + I: RowIndex + fmt::Display, + { + match self.get_inner(&idx) { + Ok(ok) => ok, + Err(err) => panic!("error retrieving column {}: {}", idx, err), + } + } + + /// Like `SimpleQueryRow::get`, but returns a `Result` rather than panicking. 
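Failures from these accessors (and from the query methods earlier in the patch) carry a SQLSTATE that can be compared against the `SqlState` constants added above through `Error::code`, the same check prepare.rs uses below for `UNDEFINED_TABLE`; a minimal sketch:

    use tokio_postgres::error::{Error, SqlState};

    // True when the error carries SQLSTATE 23505 (unique_violation).
    fn is_unique_violation(err: &Error) -> bool {
        err.code() == Some(&SqlState::UNIQUE_VIOLATION)
    }
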
+ pub fn try_get(&self, idx: I) -> Result, Error> + where + I: RowIndex + fmt::Display, + { + self.get_inner(&idx) + } + + fn get_inner(&self, idx: &I) -> Result, Error> + where + I: RowIndex + fmt::Display, + { + let idx = match idx.__idx(&self.columns) { + Some(idx) => idx, + None => return Err(Error::column(idx.to_string())), + }; + + let buf = self.ranges[idx].clone().map(|r| &self.body.buffer()[r]); + FromSql::from_sql_nullable(&Type::TEXT, buf).map_err(|e| Error::from_sql(e, idx)) + } +} diff --git a/libs/proxy/tokio-postgres2/src/simple_query.rs b/libs/proxy/tokio-postgres2/src/simple_query.rs new file mode 100644 index 0000000000..fb2550377b --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/simple_query.rs @@ -0,0 +1,142 @@ +use crate::client::{InnerClient, Responses}; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::{Error, ReadyForQueryStatus, SimpleQueryMessage, SimpleQueryRow}; +use bytes::Bytes; +use fallible_iterator::FallibleIterator; +use futures_util::{ready, Stream}; +use log::debug; +use pin_project_lite::pin_project; +use postgres_protocol2::message::backend::Message; +use postgres_protocol2::message::frontend; +use std::marker::PhantomPinned; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +/// Information about a column of a single query row. +#[derive(Debug)] +pub struct SimpleColumn { + name: String, +} + +impl SimpleColumn { + pub(crate) fn new(name: String) -> SimpleColumn { + SimpleColumn { name } + } + + /// Returns the name of the column. + pub fn name(&self) -> &str { + &self.name + } +} + +pub async fn simple_query(client: &InnerClient, query: &str) -> Result { + debug!("executing simple query: {}", query); + + let buf = encode(client, query)?; + let responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + Ok(SimpleQueryStream { + responses, + columns: None, + status: ReadyForQueryStatus::Unknown, + _p: PhantomPinned, + }) +} + +pub async fn batch_execute( + client: &InnerClient, + query: &str, +) -> Result { + debug!("executing statement batch: {}", query); + + let buf = encode(client, query)?; + let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?; + + loop { + match responses.next().await? { + Message::ReadyForQuery(status) => return Ok(status.into()), + Message::CommandComplete(_) + | Message::EmptyQueryResponse + | Message::RowDescription(_) + | Message::DataRow(_) => {} + _ => return Err(Error::unexpected_message()), + } + } +} + +pub(crate) fn encode(client: &InnerClient, query: &str) -> Result { + client.with_buf(|buf| { + frontend::query(query, buf).map_err(Error::encode)?; + Ok(buf.split().freeze()) + }) +} + +pin_project! { + /// A stream of simple query results. + pub struct SimpleQueryStream { + responses: Responses, + columns: Option>, + status: ReadyForQueryStatus, + #[pin] + _p: PhantomPinned, + } +} + +impl SimpleQueryStream { + /// Returns if the connection is ready for querying, with the status of the connection. + /// + /// This might be available only after the stream has been exhausted. + pub fn ready_status(&self) -> ReadyForQueryStatus { + self.status + } +} + +impl Stream for SimpleQueryStream { + type Item = Result; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.project(); + loop { + match ready!(this.responses.poll_next(cx)?) { + Message::CommandComplete(body) => { + let rows = body + .tag() + .map_err(Error::parse)? 
+ .rsplit(' ') + .next() + .unwrap() + .parse() + .unwrap_or(0); + return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(rows)))); + } + Message::EmptyQueryResponse => { + return Poll::Ready(Some(Ok(SimpleQueryMessage::CommandComplete(0)))); + } + Message::RowDescription(body) => { + let columns = body + .fields() + .map(|f| Ok(SimpleColumn::new(f.name().to_string()))) + .collect::>() + .map_err(Error::parse)? + .into(); + + *this.columns = Some(columns); + } + Message::DataRow(body) => { + let row = match &this.columns { + Some(columns) => SimpleQueryRow::new(columns.clone(), body)?, + None => return Poll::Ready(Some(Err(Error::unexpected_message()))), + }; + return Poll::Ready(Some(Ok(SimpleQueryMessage::Row(row)))); + } + Message::ReadyForQuery(s) => { + *this.status = s.into(); + return Poll::Ready(None); + } + _ => return Poll::Ready(Some(Err(Error::unexpected_message()))), + } + } + } +} diff --git a/libs/proxy/tokio-postgres2/src/statement.rs b/libs/proxy/tokio-postgres2/src/statement.rs new file mode 100644 index 0000000000..22e160fc05 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/statement.rs @@ -0,0 +1,157 @@ +use crate::client::InnerClient; +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::types::Type; +use postgres_protocol2::{ + message::{backend::Field, frontend}, + Oid, +}; +use std::{ + fmt, + sync::{Arc, Weak}, +}; + +struct StatementInner { + client: Weak, + name: String, + params: Vec, + columns: Vec, +} + +impl Drop for StatementInner { + fn drop(&mut self) { + if let Some(client) = self.client.upgrade() { + let buf = client.with_buf(|buf| { + frontend::close(b'S', &self.name, buf).unwrap(); + frontend::sync(buf); + buf.split().freeze() + }); + let _ = client.send(RequestMessages::Single(FrontendMessage::Raw(buf))); + } + } +} + +/// A prepared statement. +/// +/// Prepared statements can only be used with the connection that created them. +#[derive(Clone)] +pub struct Statement(Arc); + +impl Statement { + pub(crate) fn new( + inner: &Arc, + name: String, + params: Vec, + columns: Vec, + ) -> Statement { + Statement(Arc::new(StatementInner { + client: Arc::downgrade(inner), + name, + params, + columns, + })) + } + + pub(crate) fn new_anonymous(params: Vec, columns: Vec) -> Statement { + Statement(Arc::new(StatementInner { + client: Weak::new(), + name: String::new(), + params, + columns, + })) + } + + pub(crate) fn name(&self) -> &str { + &self.0.name + } + + /// Returns the expected types of the statement's parameters. + pub fn params(&self) -> &[Type] { + &self.0.params + } + + /// Returns information about the columns returned when the statement is queried. + pub fn columns(&self) -> &[Column] { + &self.0.columns + } +} + +/// Information about a column of a query. +pub struct Column { + name: String, + type_: Type, + + // raw fields from RowDescription + table_oid: Oid, + column_id: i16, + format: i16, + + // that better be stored in self.type_, but that is more radical refactoring + type_oid: Oid, + type_size: i16, + type_modifier: i32, +} + +impl Column { + pub(crate) fn new(name: String, type_: Type, raw_field: Field<'_>) -> Column { + Column { + name, + type_, + table_oid: raw_field.table_oid(), + column_id: raw_field.column_id(), + format: raw_field.format(), + type_oid: raw_field.type_oid(), + type_size: raw_field.type_size(), + type_modifier: raw_field.type_modifier(), + } + } + + /// Returns the name of the column. + pub fn name(&self) -> &str { + &self.name + } + + /// Returns the type of the column. 
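For completeness, a sketch of draining the `SimpleQueryStream` defined just above; how the stream is obtained (upstream exposes it through a `Client::simple_query`-style method) is left out, so only the message handling comes from this file:

    use futures_util::{pin_mut, TryStreamExt};
    use tokio_postgres::{Error, SimpleQueryMessage, SimpleQueryStream};

    async fn drain(stream: SimpleQueryStream) -> Result<u64, Error> {
        pin_mut!(stream);
        let mut affected = 0;
        while let Some(msg) = stream.try_next().await? {
            match msg {
                // Each data row exposes columns by index or by name.
                SimpleQueryMessage::Row(row) => {
                    let _first = row.get(0);
                }
                SimpleQueryMessage::CommandComplete(n) => affected += n,
                _ => {} // the enum is #[non_exhaustive]
            }
        }
        Ok(affected)
    }
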
+ pub fn type_(&self) -> &Type { + &self.type_ + } + + /// Returns the table OID of the column. + pub fn table_oid(&self) -> Oid { + self.table_oid + } + + /// Returns the column ID of the column. + pub fn column_id(&self) -> i16 { + self.column_id + } + + /// Returns the format of the column. + pub fn format(&self) -> i16 { + self.format + } + + /// Returns the type OID of the column. + pub fn type_oid(&self) -> Oid { + self.type_oid + } + + /// Returns the type size of the column. + pub fn type_size(&self) -> i16 { + self.type_size + } + + /// Returns the type modifier of the column. + pub fn type_modifier(&self) -> i32 { + self.type_modifier + } +} + +impl fmt::Debug for Column { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Column") + .field("name", &self.name) + .field("type", &self.type_) + .finish() + } +} diff --git a/libs/proxy/tokio-postgres2/src/tls.rs b/libs/proxy/tokio-postgres2/src/tls.rs new file mode 100644 index 0000000000..dc8140719f --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/tls.rs @@ -0,0 +1,162 @@ +//! TLS support. + +use std::error::Error; +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; +use std::{fmt, io}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + +pub(crate) mod private { + pub struct ForcePrivateApi; +} + +/// Channel binding information returned from a TLS handshake. +pub struct ChannelBinding { + pub(crate) tls_server_end_point: Option>, +} + +impl ChannelBinding { + /// Creates a `ChannelBinding` containing no information. + pub fn none() -> ChannelBinding { + ChannelBinding { + tls_server_end_point: None, + } + } + + /// Creates a `ChannelBinding` containing `tls-server-end-point` channel binding information. + pub fn tls_server_end_point(tls_server_end_point: Vec) -> ChannelBinding { + ChannelBinding { + tls_server_end_point: Some(tls_server_end_point), + } + } +} + +/// A constructor of `TlsConnect`ors. +/// +/// Requires the `runtime` Cargo feature (enabled by default). +pub trait MakeTlsConnect { + /// The stream type created by the `TlsConnect` implementation. + type Stream: TlsStream + Unpin; + /// The `TlsConnect` implementation created by this type. + type TlsConnect: TlsConnect; + /// The error type returned by the `TlsConnect` implementation. + type Error: Into>; + + /// Creates a new `TlsConnect`or. + /// + /// The domain name is provided for certificate verification and SNI. + fn make_tls_connect(&mut self, domain: &str) -> Result; +} + +/// An asynchronous function wrapping a stream in a TLS session. +pub trait TlsConnect { + /// The stream returned by the future. + type Stream: TlsStream + Unpin; + /// The error returned by the future. + type Error: Into>; + /// The future returned by the connector. + type Future: Future>; + + /// Returns a future performing a TLS handshake over the stream. + fn connect(self, stream: S) -> Self::Future; + + #[doc(hidden)] + fn can_connect(&self, _: private::ForcePrivateApi) -> bool { + true + } +} + +/// A TLS-wrapped connection to a PostgreSQL database. +pub trait TlsStream: AsyncRead + AsyncWrite { + /// Returns channel binding information for the session. + fn channel_binding(&self) -> ChannelBinding; +} + +/// A `MakeTlsConnect` and `TlsConnect` implementation which simply returns an error. +/// +/// This can be used when `sslmode` is `none` or `prefer`. 
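The extra RowDescription fields kept on `Column` above can be read straight off a result stream; a small sketch:

    use tokio_postgres::RowStream;

    // Logs the wire-level description of every result column.
    fn describe(rows: &RowStream) {
        for col in rows.columns() {
            println!(
                "{} oid={} table={} size={} modifier={} format={}",
                col.name(),
                col.type_oid(),
                col.table_oid(),
                col.type_size(),
                col.type_modifier(),
                col.format(),
            );
        }
    }
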
+#[derive(Debug, Copy, Clone)] +pub struct NoTls; + +impl MakeTlsConnect for NoTls { + type Stream = NoTlsStream; + type TlsConnect = NoTls; + type Error = NoTlsError; + + fn make_tls_connect(&mut self, _: &str) -> Result { + Ok(NoTls) + } +} + +impl TlsConnect for NoTls { + type Stream = NoTlsStream; + type Error = NoTlsError; + type Future = NoTlsFuture; + + fn connect(self, _: S) -> NoTlsFuture { + NoTlsFuture(()) + } + + fn can_connect(&self, _: private::ForcePrivateApi) -> bool { + false + } +} + +/// The future returned by `NoTls`. +pub struct NoTlsFuture(()); + +impl Future for NoTlsFuture { + type Output = Result; + + fn poll(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll { + Poll::Ready(Err(NoTlsError(()))) + } +} + +/// The TLS "stream" type produced by the `NoTls` connector. +/// +/// Since `NoTls` doesn't support TLS, this type is uninhabited. +pub enum NoTlsStream {} + +impl AsyncRead for NoTlsStream { + fn poll_read( + self: Pin<&mut Self>, + _: &mut Context<'_>, + _: &mut ReadBuf<'_>, + ) -> Poll> { + match *self {} + } +} + +impl AsyncWrite for NoTlsStream { + fn poll_write(self: Pin<&mut Self>, _: &mut Context<'_>, _: &[u8]) -> Poll> { + match *self {} + } + + fn poll_flush(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll> { + match *self {} + } + + fn poll_shutdown(self: Pin<&mut Self>, _: &mut Context<'_>) -> Poll> { + match *self {} + } +} + +impl TlsStream for NoTlsStream { + fn channel_binding(&self) -> ChannelBinding { + match *self {} + } +} + +/// The error returned by `NoTls`. +#[derive(Debug)] +pub struct NoTlsError(()); + +impl fmt::Display for NoTlsError { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.write_str("no TLS implementation configured") + } +} + +impl Error for NoTlsError {} diff --git a/libs/proxy/tokio-postgres2/src/to_statement.rs b/libs/proxy/tokio-postgres2/src/to_statement.rs new file mode 100644 index 0000000000..427f77dd79 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/to_statement.rs @@ -0,0 +1,57 @@ +use crate::to_statement::private::{Sealed, ToStatementType}; +use crate::Statement; + +mod private { + use crate::{Client, Error, Statement}; + + pub trait Sealed {} + + pub enum ToStatementType<'a> { + Statement(&'a Statement), + Query(&'a str), + } + + impl<'a> ToStatementType<'a> { + pub async fn into_statement(self, client: &Client) -> Result { + match self { + ToStatementType::Statement(s) => Ok(s.clone()), + ToStatementType::Query(s) => client.prepare(s).await, + } + } + } +} + +/// A trait abstracting over prepared and unprepared statements. +/// +/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which +/// was prepared previously. +/// +/// This trait is "sealed" and cannot be implemented by anything outside this crate. 
+pub trait ToStatement: Sealed { + #[doc(hidden)] + fn __convert(&self) -> ToStatementType<'_>; +} + +impl ToStatement for Statement { + fn __convert(&self) -> ToStatementType<'_> { + ToStatementType::Statement(self) + } +} + +impl Sealed for Statement {} + +impl ToStatement for str { + fn __convert(&self) -> ToStatementType<'_> { + ToStatementType::Query(self) + } +} + +impl Sealed for str {} + +impl ToStatement for String { + fn __convert(&self) -> ToStatementType<'_> { + ToStatementType::Query(self) + } +} + +impl Sealed for String {} diff --git a/libs/proxy/tokio-postgres2/src/transaction.rs b/libs/proxy/tokio-postgres2/src/transaction.rs new file mode 100644 index 0000000000..03a57e4947 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/transaction.rs @@ -0,0 +1,74 @@ +use crate::codec::FrontendMessage; +use crate::connection::RequestMessages; +use crate::query::RowStream; +use crate::{CancelToken, Client, Error, ReadyForQueryStatus}; +use postgres_protocol2::message::frontend; + +/// A representation of a PostgreSQL database transaction. +/// +/// Transactions will implicitly roll back when dropped. Use the `commit` method to commit the changes made in the +/// transaction. Transactions can be nested, with inner transactions implemented via safepoints. +pub struct Transaction<'a> { + client: &'a mut Client, + done: bool, +} + +impl Drop for Transaction<'_> { + fn drop(&mut self) { + if self.done { + return; + } + + let buf = self.client.inner().with_buf(|buf| { + frontend::query("ROLLBACK", buf).unwrap(); + buf.split().freeze() + }); + let _ = self + .client + .inner() + .send(RequestMessages::Single(FrontendMessage::Raw(buf))); + } +} + +impl<'a> Transaction<'a> { + pub(crate) fn new(client: &'a mut Client) -> Transaction<'a> { + Transaction { + client, + done: false, + } + } + + /// Consumes the transaction, committing all changes made within it. + pub async fn commit(mut self) -> Result { + self.done = true; + self.client.batch_execute("COMMIT").await + } + + /// Rolls the transaction back, discarding all changes made within it. + /// + /// This is equivalent to `Transaction`'s `Drop` implementation, but provides any error encountered to the caller. + pub async fn rollback(mut self) -> Result { + self.done = true; + self.client.batch_execute("ROLLBACK").await + } + + /// Like `Client::query_raw_txt`. + pub async fn query_raw_txt(&self, statement: &str, params: I) -> Result + where + S: AsRef, + I: IntoIterator>, + I::IntoIter: ExactSizeIterator, + { + self.client.query_raw_txt(statement, params).await + } + + /// Like `Client::cancel_token`. + pub fn cancel_token(&self) -> CancelToken { + self.client.cancel_token() + } + + /// Returns a reference to the underlying `Client`. + pub fn client(&self) -> &Client { + self.client + } +} diff --git a/libs/proxy/tokio-postgres2/src/transaction_builder.rs b/libs/proxy/tokio-postgres2/src/transaction_builder.rs new file mode 100644 index 0000000000..9718ac588c --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/transaction_builder.rs @@ -0,0 +1,113 @@ +use crate::{Client, Error, Transaction}; + +/// The isolation level of a database transaction. +#[derive(Debug, Copy, Clone)] +#[non_exhaustive] +pub enum IsolationLevel { + /// Equivalent to `ReadCommitted`. + ReadUncommitted, + + /// An individual statement in the transaction will see rows committed before it began. + ReadCommitted, + + /// All statements in the transaction will see the same view of rows committed before the first query in the + /// transaction. 
+ RepeatableRead, + + /// The reads and writes in this transaction must be able to be committed as an atomic "unit" with respect to reads + /// and writes of all other concurrent serializable transactions without interleaving. + Serializable, +} + +/// A builder for database transactions. +pub struct TransactionBuilder<'a> { + client: &'a mut Client, + isolation_level: Option, + read_only: Option, + deferrable: Option, +} + +impl<'a> TransactionBuilder<'a> { + pub(crate) fn new(client: &'a mut Client) -> TransactionBuilder<'a> { + TransactionBuilder { + client, + isolation_level: None, + read_only: None, + deferrable: None, + } + } + + /// Sets the isolation level of the transaction. + pub fn isolation_level(mut self, isolation_level: IsolationLevel) -> Self { + self.isolation_level = Some(isolation_level); + self + } + + /// Sets the access mode of the transaction. + pub fn read_only(mut self, read_only: bool) -> Self { + self.read_only = Some(read_only); + self + } + + /// Sets the deferrability of the transaction. + /// + /// If the transaction is also serializable and read only, creation of the transaction may block, but when it + /// completes the transaction is able to run with less overhead and a guarantee that it will not be aborted due to + /// serialization failure. + pub fn deferrable(mut self, deferrable: bool) -> Self { + self.deferrable = Some(deferrable); + self + } + + /// Begins the transaction. + /// + /// The transaction will roll back by default - use the `commit` method to commit it. + pub async fn start(self) -> Result, Error> { + let mut query = "START TRANSACTION".to_string(); + let mut first = true; + + if let Some(level) = self.isolation_level { + first = false; + + query.push_str(" ISOLATION LEVEL "); + let level = match level { + IsolationLevel::ReadUncommitted => "READ UNCOMMITTED", + IsolationLevel::ReadCommitted => "READ COMMITTED", + IsolationLevel::RepeatableRead => "REPEATABLE READ", + IsolationLevel::Serializable => "SERIALIZABLE", + }; + query.push_str(level); + } + + if let Some(read_only) = self.read_only { + if !first { + query.push(','); + } + first = false; + + let s = if read_only { + " READ ONLY" + } else { + " READ WRITE" + }; + query.push_str(s); + } + + if let Some(deferrable) = self.deferrable { + if !first { + query.push(','); + } + + let s = if deferrable { + " DEFERRABLE" + } else { + " NOT DEFERRABLE" + }; + query.push_str(s); + } + + self.client.batch_execute(&query).await?; + + Ok(Transaction::new(self.client)) + } +} diff --git a/libs/proxy/tokio-postgres2/src/types.rs b/libs/proxy/tokio-postgres2/src/types.rs new file mode 100644 index 0000000000..e571d7ee00 --- /dev/null +++ b/libs/proxy/tokio-postgres2/src/types.rs @@ -0,0 +1,6 @@ +//! Types. +//! +//! This module is a reexport of the `postgres_types` crate. 
+ +#[doc(inline)] +pub use postgres_types2::*; diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 1665d6361a..0d774d529d 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -55,6 +55,7 @@ parquet.workspace = true parquet_derive.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true +postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true prometheus.workspace = true rand.workspace = true @@ -80,8 +81,7 @@ subtle.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } -tokio-postgres = { workspace = true, features = ["with-serde_json-1"] } -tokio-postgres-rustls.workspace = true +tokio-postgres = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } @@ -96,7 +96,6 @@ utils.workspace = true uuid.workspace = true rustls-native-certs.workspace = true x509-parser.workspace = true -postgres-protocol.workspace = true redis.workspace = true zerocopy.workspace = true @@ -117,6 +116,5 @@ tokio-tungstenite.workspace = true pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true rstest.workspace = true -tokio-postgres-rustls.workspace = true walkdir.workspace = true rand_distr = "0.4" diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 8408d4720b..2abe88ac88 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -13,7 +13,6 @@ use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; -use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; @@ -24,6 +23,7 @@ use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::MetricsAuxInfo; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; +use crate::postgres_rustls::MakeRustlsConnect; use crate::proxy::neon_option; use crate::types::Host; @@ -244,7 +244,6 @@ impl ConnCfg { let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432); let host = match host { Host::Tcp(host) => host.as_str(), - Host::Unix(_) => continue, // unix sockets are not welcome here }; match connect_once(host, *port).await { @@ -315,7 +314,7 @@ impl ConnCfg { }; let client_config = client_config.with_no_client_auth(); - let mut mk_tls = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); + let mut mk_tls = crate::postgres_rustls::MakeRustlsConnect::new(client_config); let tls = >::make_tls_connect( &mut mk_tls, host, diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 5c19a23e36..4a063a5faa 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -414,6 +414,7 @@ impl RequestContextInner { outcome, }); } + if let Some(tx) = self.sender.take() { // If type changes, this error handling needs to be updated. 
let tx: mpsc::UnboundedSender = tx; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ad7e1d2771..ba69f9cf2d 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -88,6 +88,7 @@ pub mod jemalloc; pub mod logging; pub mod metrics; pub mod parse; +pub mod postgres_rustls; pub mod protocol2; pub mod proxy; pub mod rate_limiter; diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/postgres_rustls/mod.rs new file mode 100644 index 0000000000..31e7915e89 --- /dev/null +++ b/proxy/src/postgres_rustls/mod.rs @@ -0,0 +1,158 @@ +use std::convert::TryFrom; +use std::sync::Arc; + +use rustls::pki_types::ServerName; +use rustls::ClientConfig; +use tokio::io::{AsyncRead, AsyncWrite}; +use tokio_postgres::tls::MakeTlsConnect; + +mod private { + use std::future::Future; + use std::io; + use std::pin::Pin; + use std::task::{Context, Poll}; + + use rustls::pki_types::ServerName; + use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; + use tokio_postgres::tls::{ChannelBinding, TlsConnect}; + use tokio_rustls::client::TlsStream; + use tokio_rustls::TlsConnector; + + use crate::config::TlsServerEndPoint; + + pub struct TlsConnectFuture { + inner: tokio_rustls::Connect, + } + + impl Future for TlsConnectFuture + where + S: AsyncRead + AsyncWrite + Unpin, + { + type Output = io::Result>; + + fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + Pin::new(&mut self.inner).poll(cx).map_ok(RustlsStream) + } + } + + pub struct RustlsConnect(pub RustlsConnectData); + + pub struct RustlsConnectData { + pub hostname: ServerName<'static>, + pub connector: TlsConnector, + } + + impl TlsConnect for RustlsConnect + where + S: AsyncRead + AsyncWrite + Unpin + Send + 'static, + { + type Stream = RustlsStream; + type Error = io::Error; + type Future = TlsConnectFuture; + + fn connect(self, stream: S) -> Self::Future { + TlsConnectFuture { + inner: self.0.connector.connect(self.0.hostname, stream), + } + } + } + + pub struct RustlsStream(TlsStream); + + impl tokio_postgres::tls::TlsStream for RustlsStream + where + S: AsyncRead + AsyncWrite + Unpin, + { + fn channel_binding(&self) -> ChannelBinding { + let (_, session) = self.0.get_ref(); + match session.peer_certificates() { + Some([cert, ..]) => TlsServerEndPoint::new(cert) + .ok() + .and_then(|cb| match cb { + TlsServerEndPoint::Sha256(hash) => Some(hash), + TlsServerEndPoint::Undefined => None, + }) + .map_or_else(ChannelBinding::none, |hash| { + ChannelBinding::tls_server_end_point(hash.to_vec()) + }), + _ => ChannelBinding::none(), + } + } + } + + impl AsyncRead for RustlsStream + where + S: AsyncRead + AsyncWrite + Unpin, + { + fn poll_read( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &mut ReadBuf<'_>, + ) -> Poll> { + Pin::new(&mut self.0).poll_read(cx, buf) + } + } + + impl AsyncWrite for RustlsStream + where + S: AsyncRead + AsyncWrite + Unpin, + { + fn poll_write( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + buf: &[u8], + ) -> Poll> { + Pin::new(&mut self.0).poll_write(cx, buf) + } + + fn poll_flush( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + Pin::new(&mut self.0).poll_flush(cx) + } + + fn poll_shutdown( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + Pin::new(&mut self.0).poll_shutdown(cx) + } + } +} + +/// A `MakeTlsConnect` implementation using `rustls`. +/// +/// That way you can connect to PostgreSQL using `rustls` as the TLS stack. 
+#[derive(Clone)] +pub struct MakeRustlsConnect { + config: Arc, +} + +impl MakeRustlsConnect { + /// Creates a new `MakeRustlsConnect` from the provided `ClientConfig`. + #[must_use] + pub fn new(config: ClientConfig) -> Self { + Self { + config: Arc::new(config), + } + } +} + +impl MakeTlsConnect for MakeRustlsConnect +where + S: AsyncRead + AsyncWrite + Unpin + Send + 'static, +{ + type Stream = private::RustlsStream; + type TlsConnect = private::RustlsConnect; + type Error = rustls::pki_types::InvalidDnsNameError; + + fn make_tls_connect(&mut self, hostname: &str) -> Result { + ServerName::try_from(hostname).map(|dns_name| { + private::RustlsConnect(private::RustlsConnectData { + hostname: dns_name.to_owned(), + connector: Arc::clone(&self.config).into(), + }) + }) + } +} diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3de8ca8736..2c2c2964b6 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -14,7 +14,6 @@ use rustls::pki_types; use tokio::io::DuplexStream; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::MakeRustlsConnect; use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; @@ -29,6 +28,7 @@ use crate::control_plane::{ self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, }; use crate::error::ErrorKind; +use crate::postgres_rustls::MakeRustlsConnect; use crate::types::{BranchId, EndpointId, ProjectId}; use crate::{sasl, scram}; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 3037e20888..75909f3358 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -333,7 +333,7 @@ impl PoolingBackend { debug!("setting up backend session state"); // initiates the auth session - if let Err(e) = client.query("select auth.init()", &[]).await { + if let Err(e) = client.execute("select auth.init()", &[]).await { discard.discard(); return Err(e.into()); } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index bd262f45ed..c302eac568 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -6,9 +6,10 @@ use std::task::{ready, Poll}; use futures::future::poll_fn; use futures::Future; use smallvec::SmallVec; +use tokio::net::TcpStream; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::{AsyncMessage, Socket}; +use tokio_postgres::AsyncMessage; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; #[cfg(test)] @@ -57,7 +58,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: tokio_postgres::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 9abe35db08..db9ac49dae 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -24,10 +24,11 @@ use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; use serde_json::value::RawValue; use signature::Signer; +use tokio::net::TcpStream; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; -use tokio_postgres::{AsyncMessage, Socket}; +use tokio_postgres::AsyncMessage; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, info_span, warn, Instrument}; @@ 
-163,7 +164,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: tokio_postgres::Connection, key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, @@ -286,11 +287,11 @@ impl ClientInnerCommon { let token = resign_jwt(&local_data.key, payload, local_data.jti)?; // initiates the auth session - self.inner.simple_query("discard all").await?; + self.inner.batch_execute("discard all").await?; self.inner - .query( + .execute( "select auth.jwt_session_init($1)", - &[&token as &(dyn ToSql + Sync)], + &[&&*token as &(dyn ToSql + Sync)], ) .await?; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index a73d9d6352..c0a3abc377 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -60,7 +60,6 @@ num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", default-features = false, features = ["with-serde_json-1"] } prost = { version = "0.13", features = ["prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -79,8 +78,7 @@ subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } tikv-jemalloc-sys = { version = "0.6", features = ["stats"] } time = { version = "0.3", features = ["macros", "serde-well-known"] } -tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", features = ["with-serde_json-1"] } +tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } From 008616cfe696b219d39a53baee396748e75f9477 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 29 Nov 2024 13:27:49 +0000 Subject: [PATCH 140/209] storage controller: use proper ScheduleContext when evacuating a node (#9908) ## Problem When picking locations for a shard, we should use a ScheduleContext that includes all the other shards in the tenant, so that we apply proper anti-affinity between shards. If we don't do this, then it can lead to unstable scheduling, where we place a shard somewhere that the optimizer will then immediately move it away from. We didn't always do this, because it was a bit awkward to accumulate the context for a tenant rather than just walking tenants. This was a TODO in `handle_node_availability_transition`: ``` // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters // for tenants without secondary locations: if they have a secondary location, then this // schedule() call is just promoting an existing secondary) ``` This is a precursor to https://github.com/neondatabase/neon/issues/8264, where the current imperfect scheduling during node evacuation hampers testing. 
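To make the intended behaviour concrete, here is a minimal, self-contained sketch of the idea (illustrative only: `Shard` and `ContextSketch` below are stand-ins, not the `TenantShard` and `ScheduleContext` types this patch touches). The point is to group the shards by tenant first, build one anti-affinity context from all of the tenant's shards, and only then pick a location for any individual shard against that context.

```rust
use std::collections::{BTreeMap, HashMap};

type TenantId = u32;
type NodeId = u32;

struct Shard {
    tenant: TenantId,
    attached: NodeId,
}

/// Anti-affinity state accumulated across every shard of one tenant.
#[derive(Default, Debug)]
struct ContextSketch {
    attached_per_node: HashMap<NodeId, usize>,
}

fn main() {
    let shards = vec![
        Shard { tenant: 1, attached: 10 },
        Shard { tenant: 1, attached: 10 },
        Shard { tenant: 2, attached: 11 },
    ];

    // Group shards by tenant before scheduling any of them.
    let mut by_tenant: BTreeMap<TenantId, Vec<&Shard>> = BTreeMap::new();
    for shard in &shards {
        by_tenant.entry(shard.tenant).or_default().push(shard);
    }

    for (tenant, tenant_shards) in by_tenant {
        // Build one context from all shards of the tenant ...
        let mut ctx = ContextSketch::default();
        for shard in &tenant_shards {
            *ctx.attached_per_node.entry(shard.attached).or_insert(0) += 1;
        }
        // ... and only then pick locations: a scheduler would prefer nodes
        // with low counts in `ctx`, spreading the tenant's shards apart.
        println!("tenant {tenant}: {:?}", ctx.attached_per_node);
    }
}
```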
## Summary of changes

- Add an iterator type that yields each shard along with a ScheduleContext
that includes all the other shards from the same tenant
- Use the iterator to replace hand-crafted logic in optimize_all_plan
(functionally identical)
- Use the iterator in `handle_node_availability_transition` to apply
proper anti-affinity during node evacuation.
---
 storage_controller/src/scheduler.rs           |  17 +-
 storage_controller/src/service.rs             | 200 +++++++-----------
 .../src/service/context_iterator.rs           | 139 ++++++++++++
 storage_controller/src/tenant_shard.rs        |  11 +-
 4 files changed, 245 insertions(+), 122 deletions(-)
 create mode 100644 storage_controller/src/service/context_iterator.rs

diff --git a/storage_controller/src/scheduler.rs b/storage_controller/src/scheduler.rs
index 2414d95eb8..ecc6b11e47 100644
--- a/storage_controller/src/scheduler.rs
+++ b/storage_controller/src/scheduler.rs
@@ -305,7 +305,7 @@ impl std::ops::Add for AffinityScore {
 
 /// Hint for whether this is a sincere attempt to schedule, or a speculative
 /// check for where we _would_ schedule (done during optimization)
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub(crate) enum ScheduleMode {
     Normal,
     Speculative,
@@ -319,7 +319,7 @@ impl Default for ScheduleMode {
 
 // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
 // it for many shards in the same tenant.
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub(crate) struct ScheduleContext {
     /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
     pub(crate) nodes: HashMap<NodeId, AffinityScore>,
@@ -331,6 +331,14 @@ pub(crate) struct ScheduleContext {
 }
 
 impl ScheduleContext {
+    pub(crate) fn new(mode: ScheduleMode) -> Self {
+        Self {
+            nodes: HashMap::new(),
+            attached_nodes: HashMap::new(),
+            mode,
+        }
+    }
+
     /// Input is a list of nodes we would like to avoid using again within this context. The more
     /// times a node is passed into this call, the less inclined we are to use it.
pub(crate) fn avoid(&mut self, nodes: &[NodeId]) { @@ -355,6 +363,11 @@ impl ScheduleContext { pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize { self.attached_nodes.get(&node_id).copied().unwrap_or(0) } + + #[cfg(test)] + pub(crate) fn attach_count(&self) -> usize { + self.attached_nodes.values().sum() + } } pub(crate) enum RefCountUpdate { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 446c476b99..636ccf11a1 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1,3 +1,6 @@ +pub mod chaos_injector; +mod context_iterator; + use hyper::Uri; use std::{ borrow::Cow, @@ -95,7 +98,7 @@ use crate::{ }, }; -pub mod chaos_injector; +use context_iterator::TenantShardContextIterator; // For operations that should be quick, like attaching a new tenant const SHORT_RECONCILE_TIMEOUT: Duration = Duration::from_secs(5); @@ -5498,49 +5501,51 @@ impl Service { let mut tenants_affected: usize = 0; - for (tenant_shard_id, tenant_shard) in tenants { - if let Some(observed_loc) = tenant_shard.observed.locations.get_mut(&node_id) { - // When a node goes offline, we set its observed configuration to None, indicating unknown: we will - // not assume our knowledge of the node's configuration is accurate until it comes back online - observed_loc.conf = None; - } + for (_tenant_id, mut schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Normal) + { + for tenant_shard in shards { + let tenant_shard_id = tenant_shard.tenant_shard_id; + if let Some(observed_loc) = + tenant_shard.observed.locations.get_mut(&node_id) + { + // When a node goes offline, we set its observed configuration to None, indicating unknown: we will + // not assume our knowledge of the node's configuration is accurate until it comes back online + observed_loc.conf = None; + } - if nodes.len() == 1 { - // Special case for single-node cluster: there is no point trying to reschedule - // any tenant shards: avoid doing so, in order to avoid spewing warnings about - // failures to schedule them. - continue; - } + if nodes.len() == 1 { + // Special case for single-node cluster: there is no point trying to reschedule + // any tenant shards: avoid doing so, in order to avoid spewing warnings about + // failures to schedule them. + continue; + } - if !nodes - .values() - .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_))) - { - // Special case for when all nodes are unavailable and/or unschedulable: there is no point - // trying to reschedule since there's nowhere else to go. Without this - // branch we incorrectly detach tenants in response to node unavailability. - continue; - } + if !nodes + .values() + .any(|n| matches!(n.may_schedule(), MaySchedule::Yes(_))) + { + // Special case for when all nodes are unavailable and/or unschedulable: there is no point + // trying to reschedule since there's nowhere else to go. Without this + // branch we incorrectly detach tenants in response to node unavailability. 
+ continue; + } - if tenant_shard.intent.demote_attached(scheduler, node_id) { - tenant_shard.sequence = tenant_shard.sequence.next(); + if tenant_shard.intent.demote_attached(scheduler, node_id) { + tenant_shard.sequence = tenant_shard.sequence.next(); - // TODO: populate a ScheduleContext including all shards in the same tenant_id (only matters - // for tenants without secondary locations: if they have a secondary location, then this - // schedule() call is just promoting an existing secondary) - let mut schedule_context = ScheduleContext::default(); - - match tenant_shard.schedule(scheduler, &mut schedule_context) { - Err(e) => { - // It is possible that some tenants will become unschedulable when too many pageservers - // go offline: in this case there isn't much we can do other than make the issue observable. - // TODO: give TenantShard a scheduling error attribute to be queried later. - tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); - } - Ok(()) => { - if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { - tenants_affected += 1; - }; + match tenant_shard.schedule(scheduler, &mut schedule_context) { + Err(e) => { + // It is possible that some tenants will become unschedulable when too many pageservers + // go offline: in this case there isn't much we can do other than make the issue observable. + // TODO: give TenantShard a scheduling error attribute to be queried later. + tracing::warn!(%tenant_shard_id, "Scheduling error when marking pageserver {} offline: {e}", node_id); + } + Ok(()) => { + if self.maybe_reconcile_shard(tenant_shard, nodes).is_some() { + tenants_affected += 1; + }; + } } } } @@ -6011,14 +6016,8 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); - let mut schedule_context = ScheduleContext::default(); - let mut reconciles_spawned = 0; - for (tenant_shard_id, shard) in tenants.iter_mut() { - if tenant_shard_id.is_shard_zero() { - schedule_context = ScheduleContext::default(); - } - + for shard in tenants.values_mut() { // Skip checking if this shard is already enqueued for reconciliation if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { // If there is something delayed, then return a nonzero count so that @@ -6033,8 +6032,6 @@ impl Service { if self.maybe_reconcile_shard(shard, &pageservers).is_some() { reconciles_spawned += 1; } - - schedule_context.avoid(&shard.intent.all_pageservers()); } reconciles_spawned @@ -6103,95 +6100,62 @@ impl Service { } fn optimize_all_plan(&self) -> Vec<(TenantShardId, ScheduleOptimization)> { - let mut schedule_context = ScheduleContext::default(); - - let mut tenant_shards: Vec<&TenantShard> = Vec::new(); - // How many candidate optimizations we will generate, before evaluating them for readniess: setting // this higher than the execution limit gives us a chance to execute some work even if the first // few optimizations we find are not ready. 
const MAX_OPTIMIZATIONS_PLAN_PER_PASS: usize = 8; let mut work = Vec::new(); - let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); - for (tenant_shard_id, shard) in tenants.iter() { - if tenant_shard_id.is_shard_zero() { - // Reset accumulators on the first shard in a tenant - schedule_context = ScheduleContext::default(); - schedule_context.mode = ScheduleMode::Speculative; - tenant_shards.clear(); - } - if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { - break; - } - - match shard.get_scheduling_policy() { - ShardSchedulingPolicy::Active => { - // Ok to do optimization + for (_tenant_id, schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Speculative) + { + for shard in shards { + if work.len() >= MAX_OPTIMIZATIONS_PLAN_PER_PASS { + break; } - ShardSchedulingPolicy::Essential - | ShardSchedulingPolicy::Pause - | ShardSchedulingPolicy::Stop => { - // Policy prevents optimizing this shard. - continue; + match shard.get_scheduling_policy() { + ShardSchedulingPolicy::Active => { + // Ok to do optimization + } + ShardSchedulingPolicy::Essential + | ShardSchedulingPolicy::Pause + | ShardSchedulingPolicy::Stop => { + // Policy prevents optimizing this shard. + continue; + } } - } - // Accumulate the schedule context for all the shards in a tenant: we must have - // the total view of all shards before we can try to optimize any of them. - schedule_context.avoid(&shard.intent.all_pageservers()); - if let Some(attached) = shard.intent.get_attached() { - schedule_context.push_attached(*attached); - } - tenant_shards.push(shard); - - // Once we have seen the last shard in the tenant, proceed to search across all shards - // in the tenant for optimizations - if shard.shard.number.0 == shard.shard.count.count() - 1 { - if tenant_shards.iter().any(|s| s.reconciler.is_some()) { + if !matches!(shard.splitting, SplitState::Idle) + || matches!(shard.policy, PlacementPolicy::Detached) + || shard.reconciler.is_some() + { // Do not start any optimizations while another change to the tenant is ongoing: this // is not necessary for correctness, but simplifies operations and implicitly throttles // optimization changes to happen in a "trickle" over time. continue; } - if tenant_shards.iter().any(|s| { - !matches!(s.splitting, SplitState::Idle) - || matches!(s.policy, PlacementPolicy::Detached) - }) { - // Never attempt to optimize a tenant that is currently being split, or - // a tenant that is meant to be detached - continue; - } - // TODO: optimization calculations are relatively expensive: create some fast-path for // the common idle case (avoiding the search on tenants that we have recently checked) - - for shard in &tenant_shards { - if let Some(optimization) = - // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to - // its primary location based on soft constraints, cut it over. - shard.optimize_attachment(nodes, &schedule_context) - { - work.push((shard.tenant_shard_id, optimization)); - break; - } else if let Some(optimization) = - // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be - // better placed on another node, based on ScheduleContext, then adjust it. This - // covers cases like after a shard split, where we might have too many shards - // in the same tenant with secondary locations on the node where they originally split. 
- shard.optimize_secondary(scheduler, &schedule_context) - { - work.push((shard.tenant_shard_id, optimization)); - break; - } - - // TODO: extend this mechanism to prefer attaching on nodes with fewer attached - // tenants (i.e. extend schedule state to distinguish attached from secondary counts), - // for the total number of attachments on a node (not just within a tenant.) + if let Some(optimization) = + // If idle, maybe ptimize attachments: if a shard has a secondary location that is preferable to + // its primary location based on soft constraints, cut it over. + shard.optimize_attachment(nodes, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; + } else if let Some(optimization) = + // If idle, maybe optimize secondary locations: if a shard has a secondary location that would be + // better placed on another node, based on ScheduleContext, then adjust it. This + // covers cases like after a shard split, where we might have too many shards + // in the same tenant with secondary locations on the node where they originally split. + shard.optimize_secondary(scheduler, &schedule_context) + { + work.push((shard.tenant_shard_id, optimization)); + break; } } } diff --git a/storage_controller/src/service/context_iterator.rs b/storage_controller/src/service/context_iterator.rs new file mode 100644 index 0000000000..d38010a27e --- /dev/null +++ b/storage_controller/src/service/context_iterator.rs @@ -0,0 +1,139 @@ +use std::collections::BTreeMap; + +use utils::id::TenantId; +use utils::shard::TenantShardId; + +use crate::scheduler::{ScheduleContext, ScheduleMode}; +use crate::tenant_shard::TenantShard; + +/// When making scheduling decisions, it is useful to have the ScheduleContext for a whole +/// tenant while considering the individual shards within it. This iterator is a helper +/// that gathers all the shards in a tenant and then yields them together with a ScheduleContext +/// for the tenant. 
+pub(super) struct TenantShardContextIterator<'a> { + schedule_mode: ScheduleMode, + inner: std::collections::btree_map::IterMut<'a, TenantShardId, TenantShard>, +} + +impl<'a> TenantShardContextIterator<'a> { + pub(super) fn new( + tenants: &'a mut BTreeMap, + schedule_mode: ScheduleMode, + ) -> Self { + Self { + schedule_mode, + inner: tenants.iter_mut(), + } + } +} + +impl<'a> Iterator for TenantShardContextIterator<'a> { + type Item = (TenantId, ScheduleContext, Vec<&'a mut TenantShard>); + + fn next(&mut self) -> Option { + let mut tenant_shards = Vec::new(); + let mut schedule_context = ScheduleContext::new(self.schedule_mode.clone()); + loop { + let (tenant_shard_id, shard) = self.inner.next()?; + + if tenant_shard_id.is_shard_zero() { + // Cleared on last shard of previous tenant + assert!(tenant_shards.is_empty()); + } + + // Accumulate the schedule context for all the shards in a tenant + schedule_context.avoid(&shard.intent.all_pageservers()); + if let Some(attached) = shard.intent.get_attached() { + schedule_context.push_attached(*attached); + } + tenant_shards.push(shard); + + if tenant_shard_id.shard_number.0 == tenant_shard_id.shard_count.count() - 1 { + return Some((tenant_shard_id.tenant_id, schedule_context, tenant_shards)); + } + } + } +} + +#[cfg(test)] +mod tests { + use std::{collections::BTreeMap, str::FromStr}; + + use pageserver_api::controller_api::PlacementPolicy; + use utils::shard::{ShardCount, ShardNumber}; + + use crate::{ + scheduler::test_utils::make_test_nodes, service::Scheduler, + tenant_shard::tests::make_test_tenant_with_id, + }; + + use super::*; + + #[test] + fn test_context_iterator() { + // Hand-crafted tenant IDs to ensure they appear in the expected order when put into + // a btreemap & iterated + let mut t_1_shards = make_test_tenant_with_id( + TenantId::from_str("af0480929707ee75372337efaa5ecf96").unwrap(), + PlacementPolicy::Attached(1), + ShardCount(1), + None, + ); + let t_2_shards = make_test_tenant_with_id( + TenantId::from_str("bf0480929707ee75372337efaa5ecf96").unwrap(), + PlacementPolicy::Attached(1), + ShardCount(4), + None, + ); + let mut t_3_shards = make_test_tenant_with_id( + TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(), + PlacementPolicy::Attached(1), + ShardCount(1), + None, + ); + + let t1_id = t_1_shards[0].tenant_shard_id.tenant_id; + let t2_id = t_2_shards[0].tenant_shard_id.tenant_id; + let t3_id = t_3_shards[0].tenant_shard_id.tenant_id; + + let mut tenants = BTreeMap::new(); + tenants.insert(t_1_shards[0].tenant_shard_id, t_1_shards.pop().unwrap()); + for shard in t_2_shards { + tenants.insert(shard.tenant_shard_id, shard); + } + tenants.insert(t_3_shards[0].tenant_shard_id, t_3_shards.pop().unwrap()); + + let nodes = make_test_nodes(3, &[]); + let mut scheduler = Scheduler::new(nodes.values()); + let mut context = ScheduleContext::default(); + for shard in tenants.values_mut() { + shard.schedule(&mut scheduler, &mut context).unwrap(); + } + + let mut iter = TenantShardContextIterator::new(&mut tenants, ScheduleMode::Speculative); + let (tenant_id, context, shards) = iter.next().unwrap(); + assert_eq!(tenant_id, t1_id); + assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); + assert_eq!(shards.len(), 1); + assert_eq!(context.attach_count(), 1); + + let (tenant_id, context, shards) = iter.next().unwrap(); + assert_eq!(tenant_id, t2_id); + assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); + assert_eq!(shards[1].tenant_shard_id.shard_number, ShardNumber(1)); + 
assert_eq!(shards[2].tenant_shard_id.shard_number, ShardNumber(2)); + assert_eq!(shards[3].tenant_shard_id.shard_number, ShardNumber(3)); + assert_eq!(shards.len(), 4); + assert_eq!(context.attach_count(), 4); + + let (tenant_id, context, shards) = iter.next().unwrap(); + assert_eq!(tenant_id, t3_id); + assert_eq!(shards[0].tenant_shard_id.shard_number, ShardNumber(0)); + assert_eq!(shards.len(), 1); + assert_eq!(context.attach_count(), 1); + + for shard in tenants.values_mut() { + shard.intent.clear(&mut scheduler); + } + } +} diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 27c97d3b86..2eb98ee825 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1574,13 +1574,20 @@ pub(crate) mod tests { ) } - fn make_test_tenant( + pub(crate) fn make_test_tenant( policy: PlacementPolicy, shard_count: ShardCount, preferred_az: Option, ) -> Vec { - let tenant_id = TenantId::generate(); + make_test_tenant_with_id(TenantId::generate(), policy, shard_count, preferred_az) + } + pub(crate) fn make_test_tenant_with_id( + tenant_id: TenantId, + policy: PlacementPolicy, + shard_count: ShardCount, + preferred_az: Option, + ) -> Vec { (0..shard_count.count()) .map(|i| { let shard_number = ShardNumber(i); From 4bb9554e4a12005a22135f4a0f5fbd164f55875a Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 29 Nov 2024 15:38:04 +0200 Subject: [PATCH 141/209] safekeeper: use jemalloc (#9780) ## Problem To add Safekeeper heap profiling in #9778, we need to switch to an allocator that supports it. Pageserver and proxy already use jemalloc. Touches #9534. ## Summary of changes Use jemalloc in Safekeeper. --- Cargo.lock | 1 + safekeeper/Cargo.toml | 1 + safekeeper/benches/receive_wal.rs | 30 +++++++++++++++++++++++++++++- safekeeper/src/bin/safekeeper.rs | 3 +++ 4 files changed, 34 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index f05c6311dd..abe69525c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5409,6 +5409,7 @@ dependencies = [ "strum", "strum_macros", "thiserror", + "tikv-jemallocator", "tokio", "tokio-io-timeout", "tokio-postgres", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 635a9222e1..0422c46ab1 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -41,6 +41,7 @@ serde_json.workspace = true strum.workspace = true strum_macros.workspace = true thiserror.workspace = true +tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["fs"] } tokio-util = { workspace = true } tokio-io-timeout.workspace = true diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index c637b4fb24..8c4281cf52 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -6,6 +6,7 @@ mod benchutils; use std::io::Write as _; use benchutils::Env; +use bytes::BytesMut; use camino_tempfile::tempfile; use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; use itertools::Itertools as _; @@ -23,6 +24,9 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + // Register benchmarks with Criterion. 
criterion_group!( name = benches; @@ -30,7 +34,8 @@ criterion_group!( targets = bench_process_msg, bench_wal_acceptor, bench_wal_acceptor_throughput, - bench_file_write + bench_file_write, + bench_bytes_reserve, ); criterion_main!(benches); @@ -341,3 +346,26 @@ fn bench_file_write(c: &mut Criterion) { Ok(()) } } + +/// Benchmarks the cost of memory allocations when receiving WAL messages. This emulates the logic +/// in FeMessage::parse, which extends the read buffer. It is primarily intended to test jemalloc. +fn bench_bytes_reserve(c: &mut Criterion) { + let mut g = c.benchmark_group("bytes_reserve"); + for size in [1, 64, KB, 8 * KB, 128 * KB] { + g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function(format!("size={size}"), |b| run_bench(b, size).unwrap()); + } + + fn run_bench(b: &mut Bencher, size: usize) -> anyhow::Result<()> { + let mut bytes = BytesMut::new(); + let data = vec![0; size]; + + b.iter(|| { + bytes.reserve(size); + bytes.extend_from_slice(&data); + bytes.split_to(size).freeze(); + }); + + Ok(()) + } +} diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 1248428d33..3659bcd7e0 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -48,6 +48,9 @@ use utils::{ tcp_listener, }; +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; From 0a550f3e7dcec29cc5b162665ef5ff16daf5eaf5 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 29 Nov 2024 14:55:56 +0100 Subject: [PATCH 142/209] feat(compute_ctl): Always set application_name (#9934) ## Problem It was not always possible to judge what exactly some `cloud_admin` connections were doing because we didn't consistently set `application_name` everywhere. ## Summary of changes Unify the way we connect to Postgres: 1. Switch to building configs everywhere 2. 
Always set `application_name` and make naming consistent Follow-up for #9919 Part of neondatabase/cloud#20948 --- compute_tools/src/bin/compute_ctl.rs | 10 ++++- compute_tools/src/catalog.rs | 7 +--- compute_tools/src/checker.rs | 3 +- compute_tools/src/compute.rs | 49 ++++++++++++++++------- compute_tools/src/http/api.rs | 11 +++-- compute_tools/src/installed_extensions.rs | 14 +++---- compute_tools/src/monitor.rs | 13 +++--- 7 files changed, 64 insertions(+), 43 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 6b670de2ea..b178d7abd6 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -37,6 +37,7 @@ use std::collections::HashMap; use std::fs::File; use std::path::Path; use std::process::exit; +use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock}; use std::{thread, time::Duration}; @@ -322,8 +323,15 @@ fn wait_spec( } else { spec_set = false; } + let connstr = Url::parse(connstr).context("cannot parse connstr as a URL")?; + let conn_conf = postgres::config::Config::from_str(connstr.as_str()) + .context("cannot build postgres config from connstr")?; + let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str()) + .context("cannot build tokio postgres config from connstr")?; let compute_node = ComputeNode { - connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, + connstr, + conn_conf, + tokio_conn_conf, pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), pgversion: get_pg_version_string(pgbin), diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs index 08ae8bf44d..72198a9479 100644 --- a/compute_tools/src/catalog.rs +++ b/compute_tools/src/catalog.rs @@ -6,7 +6,6 @@ use tokio::{ process::Command, spawn, }; -use tokio_postgres::connect; use tokio_stream::{self as stream, StreamExt}; use tokio_util::codec::{BytesCodec, FramedRead}; use tracing::warn; @@ -16,10 +15,8 @@ use crate::pg_helpers::{get_existing_dbs_async, get_existing_roles_async, postgr use compute_api::responses::CatalogObjects; pub async fn get_dbs_and_roles(compute: &Arc) -> anyhow::Result { - let connstr = compute.connstr.clone(); - - let (client, connection): (tokio_postgres::Client, _) = - connect(connstr.as_str(), NoTls).await?; + let conf = compute.get_tokio_conn_conf(Some("compute_ctl:get_dbs_and_roles")); + let (client, connection): (tokio_postgres::Client, _) = conf.connect(NoTls).await?; spawn(async move { if let Err(e) = connection.await { diff --git a/compute_tools/src/checker.rs b/compute_tools/src/checker.rs index cec2b1bed8..62d61a8bc9 100644 --- a/compute_tools/src/checker.rs +++ b/compute_tools/src/checker.rs @@ -9,7 +9,8 @@ use crate::compute::ComputeNode; #[instrument(skip_all)] pub async fn check_writability(compute: &ComputeNode) -> Result<()> { // Connect to the database. 
- let (client, connection) = tokio_postgres::connect(compute.connstr.as_str(), NoTls).await?; + let conf = compute.get_tokio_conn_conf(Some("compute_ctl:availability_checker")); + let (client, connection) = conf.connect(NoTls).await?; if client.is_closed() { return Err(anyhow!("connection to postgres closed")); } diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 1a026a4014..da1caf1a9b 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -20,8 +20,9 @@ use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; use nix::unistd::Pid; +use postgres; use postgres::error::SqlState; -use postgres::{Client, NoTls}; +use postgres::NoTls; use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -58,6 +59,10 @@ pub static PG_PID: AtomicU32 = AtomicU32::new(0); pub struct ComputeNode { // Url type maintains proper escaping pub connstr: url::Url, + // We connect to Postgres from many different places, so build configs once + // and reuse them where needed. + pub conn_conf: postgres::config::Config, + pub tokio_conn_conf: tokio_postgres::config::Config, pub pgdata: String, pub pgbin: String, pub pgversion: String, @@ -800,10 +805,10 @@ impl ComputeNode { /// version. In the future, it may upgrade all 3rd-party extensions. #[instrument(skip_all)] pub fn post_apply_config(&self) -> Result<()> { - let connstr = self.connstr.clone(); + let conf = self.get_conn_conf(Some("compute_ctl:post_apply_config")); thread::spawn(move || { let func = || { - let mut client = Client::connect(connstr.as_str(), NoTls)?; + let mut client = conf.connect(NoTls)?; handle_neon_extension_upgrade(&mut client) .context("handle_neon_extension_upgrade")?; Ok::<_, anyhow::Error>(()) @@ -815,12 +820,27 @@ impl ComputeNode { Ok(()) } + pub fn get_conn_conf(&self, application_name: Option<&str>) -> postgres::Config { + let mut conf = self.conn_conf.clone(); + if let Some(application_name) = application_name { + conf.application_name(application_name); + } + conf + } + + pub fn get_tokio_conn_conf(&self, application_name: Option<&str>) -> tokio_postgres::Config { + let mut conf = self.tokio_conn_conf.clone(); + if let Some(application_name) = application_name { + conf.application_name(application_name); + } + conf + } + async fn get_maintenance_client( conf: &tokio_postgres::Config, ) -> Result { let mut conf = conf.clone(); - - conf.application_name("apply_config"); + conf.application_name("compute_ctl:apply_config"); let (client, conn) = match conf.connect(NoTls).await { // If connection fails, it may be the old node with `zenith_admin` superuser. @@ -837,6 +857,7 @@ impl ComputeNode { e ); let mut zenith_admin_conf = postgres::config::Config::from(conf.clone()); + zenith_admin_conf.application_name("compute_ctl:apply_config"); zenith_admin_conf.user("zenith_admin"); let mut client = @@ -1134,8 +1155,7 @@ impl ComputeNode { /// Do initial configuration of the already started Postgres. 
#[instrument(skip_all)] pub fn apply_config(&self, compute_state: &ComputeState) -> Result<()> { - let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap(); - conf.application_name("apply_config"); + let conf = self.get_tokio_conn_conf(Some("compute_ctl:apply_config")); let conf = Arc::new(conf); let spec = Arc::new( @@ -1161,7 +1181,7 @@ impl ComputeNode { thread::spawn(move || { let conf = conf.as_ref().clone(); let mut conf = postgres::config::Config::from(conf); - conf.application_name("migrations"); + conf.application_name("compute_ctl:migrations"); let mut client = conf.connect(NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") @@ -1369,9 +1389,9 @@ impl ComputeNode { } self.post_apply_config()?; - let connstr = self.connstr.clone(); + let conf = self.get_conn_conf(None); thread::spawn(move || { - let res = get_installed_extensions(&connstr); + let res = get_installed_extensions(conf); match res { Ok(extensions) => { info!( @@ -1510,7 +1530,8 @@ impl ComputeNode { /// Select `pg_stat_statements` data and return it as a stringified JSON pub async fn collect_insights(&self) -> String { let mut result_rows: Vec = Vec::new(); - let connect_result = tokio_postgres::connect(self.connstr.as_str(), NoTls).await; + let conf = self.get_tokio_conn_conf(Some("compute_ctl:collect_insights")); + let connect_result = conf.connect(NoTls).await; let (client, connection) = connect_result.unwrap(); tokio::spawn(async move { if let Err(e) = connection.await { @@ -1636,10 +1657,9 @@ LIMIT 100", privileges: &[Privilege], role_name: &PgIdent, ) -> Result<()> { - use tokio_postgres::config::Config; use tokio_postgres::NoTls; - let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:set_role_grants")); conf.dbname(db_name); let (db_client, conn) = conf @@ -1676,10 +1696,9 @@ LIMIT 100", db_name: &PgIdent, ext_version: ExtVersion, ) -> Result { - use tokio_postgres::config::Config; use tokio_postgres::NoTls; - let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + let mut conf = self.get_tokio_conn_conf(Some("compute_ctl:install_extension")); conf.dbname(db_name); let (db_client, conn) = conf diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index a6c6cff20a..7fa6426d8f 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -295,12 +295,11 @@ async fn routes(req: Request, compute: &Arc) -> Response render_json(Body::from(serde_json::to_string(&res).unwrap())), diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index f473c29a55..5f62f08858 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -10,8 +10,6 @@ use metrics::core::Collector; use metrics::{register_uint_gauge_vec, UIntGaugeVec}; use once_cell::sync::Lazy; -use crate::pg_helpers::postgres_conf_for_db; - /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. /// @@ -41,14 +39,16 @@ fn list_dbs(client: &mut Client) -> Result> { /// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. 
-pub fn get_installed_extensions(connstr: &url::Url) -> Result { - let mut client = Client::connect(connstr.as_str(), NoTls)?; +pub fn get_installed_extensions(mut conf: postgres::config::Config) -> Result { + conf.application_name("compute_ctl:get_installed_extensions"); + let mut client = conf.connect(NoTls)?; + let databases: Vec = list_dbs(&mut client)?; let mut extensions_map: HashMap = HashMap::new(); for db in databases.iter() { - let config = postgres_conf_for_db(connstr, db)?; - let mut db_client = config.connect(NoTls)?; + conf.dbname(db); + let mut db_client = conf.connect(NoTls)?; let extensions: Vec<(String, String)> = db_client .query( "SELECT extname, extversion FROM pg_catalog.pg_extension;", @@ -82,7 +82,7 @@ pub fn get_installed_extensions(connstr: &url::Url) -> Result = None; @@ -57,7 +54,7 @@ fn watch_compute_activity(compute: &ComputeNode) { info!("connection to Postgres is closed, trying to reconnect"); // Connection is closed, reconnect and try again. - client = Client::connect(connstr, NoTls); + client = conf.connect(NoTls); continue; } @@ -196,7 +193,7 @@ fn watch_compute_activity(compute: &ComputeNode) { debug!("could not connect to Postgres: {}, retrying", e); // Establish a new connection and try again. - client = Client::connect(connstr, NoTls); + client = conf.connect(NoTls); } } } From 4f3649461512e5696a64f65972a68f5a115224e7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 29 Nov 2024 15:11:44 +0000 Subject: [PATCH 143/209] pageserver: download small objects using a smaller timeout (#9938) ## Problem It appears that the Azure storage API tends to hang TCP connections more than S3 does. Currently we use a 2 minute timeout for all downloads. This is large because sometimes the objects we download are large. However, waiting 2 minutes when doing something like downloading a manifest on tenant attach is problematic, because when someone is doing a "create tenant, create timeline" workflow, that 2 minutes is long enough for them reasonably to give up creating that timeline. Rather than propagate oversized timeouts further up the stack, we should use a different timeout for objects that we expect to be small. Closes: https://github.com/neondatabase/neon/issues/9836 ## Summary of changes - Add a `small_timeout` configuration attribute to remote storage, defaulting to 30 seconds (still a very generous period to do something like download an index) - Add a DownloadKind parameter to DownloadOpts, so that callers can indicate whether they expect the object to be small or large. - In the azure client, use small timeout for HEAD requests, and for GET requests if DownloadKind::Small is used. - Use DownloadKind::Small for manifests, indices, and heatmap downloads. 
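The effect of the new parameter is easiest to see as a small sketch (illustrative only; the real types live in `remote_storage` and the exact code is in the diff below). The two durations stand in for the configured `timeout` and `small_timeout`, using the defaults this patch adds (120s and 30s).

```rust
use std::time::Duration;

enum DownloadKind {
    Large,
    Small,
}

/// Pick the per-request timeout from the expected object size.
fn timeout_for(kind: &DownloadKind, timeout: Duration, small_timeout: Duration) -> Duration {
    match kind {
        DownloadKind::Small => small_timeout,
        DownloadKind::Large => timeout,
    }
}

fn main() {
    let t = timeout_for(
        &DownloadKind::Small,
        Duration::from_secs(120),
        Duration::from_secs(30),
    );
    assert_eq!(t, Duration::from_secs(30));
}
```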
This PR intentionally does not make the equivalent change to the S3 client, to reduce blast radius in case this has unexpected consequences (we could accomplish the same thing by editing lots of configs, but just skipping the code is simpler for right now) --- libs/remote_storage/src/azure_blob.rs | 23 ++++++++++++++--- libs/remote_storage/src/config.rs | 25 ++++++++++++++++--- libs/remote_storage/src/lib.rs | 20 ++++++++++++++- libs/remote_storage/tests/test_real_azure.rs | 3 ++- libs/remote_storage/tests/test_real_s3.rs | 1 + pageserver/src/deletion_queue.rs | 1 + pageserver/src/tenant.rs | 1 + .../tenant/remote_timeline_client/download.rs | 21 +++++++++++++--- pageserver/src/tenant/secondary/downloader.rs | 3 ++- .../import_pgdata/importbucket_client.rs | 4 ++- proxy/src/context/parquet.rs | 2 ++ 11 files changed, 89 insertions(+), 15 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index 840917ef68..8d1962fa29 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -35,6 +35,7 @@ use utils::backoff; use utils::backoff::exponential_backoff_duration_seconds; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; +use crate::DownloadKind; use crate::{ config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, @@ -49,10 +50,17 @@ pub struct AzureBlobStorage { concurrency_limiter: ConcurrencyLimiter, // Per-request timeout. Accessible for tests. pub timeout: Duration, + + // Alternative timeout used for metadata objects which are expected to be small + pub small_timeout: Duration, } impl AzureBlobStorage { - pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result { + pub fn new( + azure_config: &AzureConfig, + timeout: Duration, + small_timeout: Duration, + ) -> Result { debug!( "Creating azure remote storage for azure container {}", azure_config.container_name @@ -94,6 +102,7 @@ impl AzureBlobStorage { max_keys_per_list_response, concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()), timeout, + small_timeout, }) } @@ -133,6 +142,7 @@ impl AzureBlobStorage { async fn download_for_builder( &self, builder: GetBlobBuilder, + timeout: Duration, cancel: &CancellationToken, ) -> Result { let kind = RequestKind::Get; @@ -156,7 +166,7 @@ impl AzureBlobStorage { .map_err(to_download_error); // apply per request timeout - let response = tokio_stream::StreamExt::timeout(response, self.timeout); + let response = tokio_stream::StreamExt::timeout(response, timeout); // flatten let response = response.map(|res| match res { @@ -415,7 +425,7 @@ impl RemoteStorage for AzureBlobStorage { let blob_client = self.client.blob_client(self.relative_path_to_name(key)); let properties_future = blob_client.get_properties().into_future(); - let properties_future = tokio::time::timeout(self.timeout, properties_future); + let properties_future = tokio::time::timeout(self.small_timeout, properties_future); let res = tokio::select! 
{ res = properties_future => res, @@ -521,7 +531,12 @@ impl RemoteStorage for AzureBlobStorage { }); } - self.download_for_builder(builder, cancel).await + let timeout = match opts.kind { + DownloadKind::Small => self.small_timeout, + DownloadKind::Large => self.timeout, + }; + + self.download_for_builder(builder, timeout, cancel).await } async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index e99ae4f747..f6ef31077c 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -24,6 +24,13 @@ pub struct RemoteStorageConfig { skip_serializing_if = "is_default_timeout" )] pub timeout: Duration, + /// Alternative timeout used for metadata objects which are expected to be small + #[serde( + with = "humantime_serde", + default = "default_small_timeout", + skip_serializing_if = "is_default_small_timeout" + )] + pub small_timeout: Duration, } impl RemoteStorageKind { @@ -40,10 +47,18 @@ fn default_timeout() -> Duration { RemoteStorageConfig::DEFAULT_TIMEOUT } +fn default_small_timeout() -> Duration { + RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT +} + fn is_default_timeout(d: &Duration) -> bool { *d == RemoteStorageConfig::DEFAULT_TIMEOUT } +fn is_default_small_timeout(d: &Duration) -> bool { + *d == RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT +} + /// A kind of a remote storage to connect to, with its connection configuration. #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] #[serde(untagged)] @@ -184,6 +199,7 @@ fn serialize_storage_class( impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); + pub const DEFAULT_SMALL_TIMEOUT: Duration = std::time::Duration::from_secs(30); pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { Ok(utils::toml_edit_ext::deserialize_item(toml)?) @@ -219,7 +235,8 @@ timeout = '5s'"; storage: RemoteStorageKind::LocalFs { local_path: Utf8PathBuf::from(".") }, - timeout: Duration::from_secs(5) + timeout: Duration::from_secs(5), + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } @@ -247,7 +264,8 @@ timeout = '5s'"; max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, upload_storage_class: Some(StorageClass::IntelligentTiering), }), - timeout: Duration::from_secs(7) + timeout: Duration::from_secs(7), + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } @@ -299,7 +317,8 @@ timeout = '5s'"; concurrency_limit: default_remote_storage_azure_concurrency_limit(), max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, }), - timeout: Duration::from_secs(7) + timeout: Duration::from_secs(7), + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT } ); } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 719608dd5f..0ece29d99e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -178,6 +178,15 @@ pub struct DownloadOpts { /// The end of the byte range to download, or unbounded. Must be after the /// start bound. 
pub byte_end: Bound, + /// Indicate whether we're downloading something small or large: this indirectly controls + /// timeouts: for something like an index/manifest/heatmap, we should time out faster than + /// for layer files + pub kind: DownloadKind, +} + +pub enum DownloadKind { + Large, + Small, } impl Default for DownloadOpts { @@ -186,6 +195,7 @@ impl Default for DownloadOpts { etag: Default::default(), byte_start: Bound::Unbounded, byte_end: Bound::Unbounded, + kind: DownloadKind::Large, } } } @@ -584,6 +594,10 @@ impl GenericRemoteStorage> { impl GenericRemoteStorage { pub async fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result { let timeout = storage_config.timeout; + + // If somkeone overrides timeout to be small without adjusting small_timeout, then adjust it automatically + let small_timeout = std::cmp::min(storage_config.small_timeout, timeout); + Ok(match &storage_config.storage { RemoteStorageKind::LocalFs { local_path: path } => { info!("Using fs root '{path}' as a remote storage"); @@ -606,7 +620,11 @@ impl GenericRemoteStorage { .unwrap_or(""); info!("Using azure container '{}' in account '{storage_account}' in region '{}' as a remote storage, prefix in container: '{:?}'", azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container); - Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?)) + Self::AzureBlob(Arc::new(AzureBlobStorage::new( + azure_config, + timeout, + small_timeout, + )?)) } }) } diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs index 3a20649490..92d579fec8 100644 --- a/libs/remote_storage/tests/test_real_azure.rs +++ b/libs/remote_storage/tests/test_real_azure.rs @@ -219,7 +219,8 @@ async fn create_azure_client( concurrency_limit: NonZeroUsize::new(100).unwrap(), max_keys_per_list_response, }), - timeout: Duration::from_secs(120), + timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config) diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 3e99a65fac..e60ec18c93 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -396,6 +396,7 @@ async fn create_s3_client( upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; Ok(Arc::new( GenericRemoteStorage::from_config(&remote_storage_config) diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index e74c8ecf5a..1d508f5fe9 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -838,6 +838,7 @@ mod test { local_path: remote_fs_dir.clone(), }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; let storage = GenericRemoteStorage::from_config(&storage_config) .await diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 339a3ca1bb..cd0690bb1a 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5423,6 +5423,7 @@ pub(crate) mod harness { local_path: remote_fs_dir.clone(), }, timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }; let remote_storage = GenericRemoteStorage::from_config(&config).await.unwrap(); let deletion_queue = 
MockDeletionQueue::new(Some(remote_storage.clone())); diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index d632e595ad..739615be9c 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -30,7 +30,9 @@ use crate::tenant::Generation; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; -use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath}; +use remote_storage::{ + DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, +}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use utils::pausable_failpoint; @@ -345,12 +347,13 @@ pub async fn list_remote_timelines( async fn do_download_remote_path_retry_forever( storage: &GenericRemoteStorage, remote_path: &RemotePath, + download_opts: DownloadOpts, cancel: &CancellationToken, ) -> Result<(Vec, SystemTime), DownloadError> { download_retry_forever( || async { let download = storage - .download(remote_path, &DownloadOpts::default(), cancel) + .download(remote_path, &download_opts, cancel) .await?; let mut bytes = Vec::new(); @@ -377,8 +380,13 @@ async fn do_download_tenant_manifest( ) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> { let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); + let download_opts = DownloadOpts { + kind: DownloadKind::Small, + ..Default::default() + }; + let (manifest_bytes, manifest_bytes_mtime) = - do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; + do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?; let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes) .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}")) @@ -398,8 +406,13 @@ async fn do_download_index_part( timeline_id.expect("A timeline ID is always provided when downloading an index"); let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); + let download_opts = DownloadOpts { + kind: DownloadKind::Small, + ..Default::default() + }; + let (index_part_bytes, index_part_mtime) = - do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; + do_download_remote_path_retry_forever(storage, &remote_path, download_opts, cancel).await?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| format!("deserialize index part file at {remote_path:?}")) diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 7443261a9c..8d771dc405 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -49,7 +49,7 @@ use futures::Future; use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use remote_storage::{DownloadError, DownloadOpts, Etag, GenericRemoteStorage}; +use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; @@ -946,6 +946,7 @@ impl<'a> TenantDownloader<'a> { let cancel = &self.secondary_state.cancel; let opts = DownloadOpts { etag: prev_etag.cloned(), + kind: 
DownloadKind::Small, ..Default::default() }; diff --git a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs index 8d5ab1780f..bc4d148a29 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs @@ -4,7 +4,8 @@ use anyhow::Context; use bytes::Bytes; use postgres_ffi::ControlFileData; use remote_storage::{ - Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingObject, RemotePath, + Download, DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, Listing, + ListingObject, RemotePath, }; use serde::de::DeserializeOwned; use tokio_util::sync::CancellationToken; @@ -239,6 +240,7 @@ impl RemoteStorageWrapper { .download( path, &DownloadOpts { + kind: DownloadKind::Large, etag: None, byte_start: Bound::Included(start_inclusive), byte_end: Bound::Excluded(end_exclusive) diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e328c6de79..b375eb886e 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -486,6 +486,7 @@ mod tests { upload_storage_class: None, }), timeout: RemoteStorageConfig::DEFAULT_TIMEOUT, + small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT, }) ); assert_eq!(parquet_upload.parquet_upload_row_group_size, 100); @@ -545,6 +546,7 @@ mod tests { local_path: tmpdir.to_path_buf(), }, timeout: std::time::Duration::from_secs(120), + small_timeout: std::time::Duration::from_secs(30), }; let storage = GenericRemoteStorage::from_config(&remote_storage_config) .await From 93f29a0065ff820b2fa16f97a824d358655282f3 Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Fri, 29 Nov 2024 17:58:36 +0000 Subject: [PATCH 144/209] Fixed fast_import pgbin in calling get_pg_version (#9933) Was working on https://github.com/neondatabase/cloud/pull/20795 and discovered that fast_import is not working normally. --- compute_tools/src/bin/fast_import.rs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index 6716cc6234..b6db3eb11a 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -21,7 +21,7 @@ //! - Build the image with the following command: //! //! ```bash -//! docker buildx build --build-arg DEBIAN_FLAVOR=bullseye-slim --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/Dockerfile.com +//! docker buildx build --platform linux/amd64 --build-arg DEBIAN_VERSION=bullseye --build-arg GIT_VERSION=local --build-arg PG_VERSION=v14 --build-arg BUILD_TAG="$(date --iso-8601=s -u)" -t localhost:3030/localregistry/compute-node-v14:latest -f compute/compute-node.Dockerfile . //! docker push localhost:3030/localregistry/compute-node-v14:latest //! 
``` @@ -132,7 +132,8 @@ pub(crate) async fn main() -> anyhow::Result<()> { // // Initialize pgdata // - let pg_version = match get_pg_version(pg_bin_dir.as_str()) { + let pgbin = pg_bin_dir.join("postgres"); + let pg_version = match get_pg_version(pgbin.as_ref()) { PostgresMajorVersion::V14 => 14, PostgresMajorVersion::V15 => 15, PostgresMajorVersion::V16 => 16, @@ -155,7 +156,7 @@ pub(crate) async fn main() -> anyhow::Result<()> { // // Launch postgres process // - let mut postgres_proc = tokio::process::Command::new(pg_bin_dir.join("postgres")) + let mut postgres_proc = tokio::process::Command::new(pgbin) .arg("-D") .arg(&pgdata_dir) .args(["-c", "wal_level=minimal"]) From 547b2d28278ccb69bb6d782e771790151bc62651 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 29 Nov 2024 20:10:26 +0100 Subject: [PATCH 145/209] Fix timeout value used in XLogWaitForReplayOf (#9937) The previous value assumed usec precision, while the timeout used is in milliseconds, causing replica backends to wait for (potentially) many hours for WAL replay without the expected progress reports in logs. This fixes the issue. Reported-By: Alexander Lakhin ## Problem https://github.com/neondatabase/postgres/pull/279#issuecomment-2507671817 The timeout value was configured with the assumption the indicated value would be microseconds, where it's actually milliseconds. That causes the backend to wait for much longer (2h46m40s) before it emits the "I'm waiting for recovery" message. While we do have wait events configured on this, it's not great to have stuck backends without clear logs, so this fixes the timeout value in all our PostgreSQL branches. ## PG PRs * PG14: https://github.com/neondatabase/postgres/pull/542 * PG15: https://github.com/neondatabase/postgres/pull/543 * PG16: https://github.com/neondatabase/postgres/pull/544 * PG17: https://github.com/neondatabase/postgres/pull/545 --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 284ae56be2..c1989c934d 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 284ae56be2397fd3eaf20777fa220b2d0ad968f5 +Subproject commit c1989c934d46e04e78b3c496c8a34bcd40ddceeb diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index aed79ee87b..d929b9a8b9 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit aed79ee87b94779cc52ec13e3b74eba6ada93f05 +Subproject commit d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index f5cfc6fa89..13e9e35394 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit f5cfc6fa898544050e821ac688adafece1ac3cff +Subproject commit 13e9e3539419003e79bd9aa29e1bc44f3fd555dd diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 3c15b6565f..faebe5e5af 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f +Subproject commit faebe5e5aff5687908504453623778f8515529db diff --git a/vendor/revisions.json b/vendor/revisions.json index 4dae88e73d..abeddcadf7 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "3c15b6565f6c8d36d169ed9ea7412cf90cfb2a8f" + "faebe5e5aff5687908504453623778f8515529db" ], "v16": [ "16.6", - "f5cfc6fa898544050e821ac688adafece1ac3cff" + 
"13e9e3539419003e79bd9aa29e1bc44f3fd555dd" ], "v15": [ "15.10", - "aed79ee87b94779cc52ec13e3b74eba6ada93f05" + "d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6" ], "v14": [ "14.15", - "284ae56be2397fd3eaf20777fa220b2d0ad968f5" + "c1989c934d46e04e78b3c496c8a34bcd40ddceeb" ] } From 5dad89acd4f7cc8231e939cc62cb6b5e048ea67e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 30 Nov 2024 01:16:24 +0100 Subject: [PATCH 146/209] page_service: rewrite batching to work without a timeout (#9851) # Problem The timeout-based batching adds latency to unbatchable workloads. We can choose a short batching timeout (e.g. 10us) but that requires high-resolution timers, which tokio doesn't have. I thoroughly explored options to use OS timers (see [this](https://github.com/neondatabase/neon/pull/9822) abandoned PR). In short, it's not an attractive option because any timer implementation adds non-trivial overheads. # Solution The insight is that, in the steady state of a batchable workload, the time we spend in `get_vectored` will be hundreds of microseconds anyway. If we prepare the next batch concurrently to `get_vectored`, we will have a sizeable batch ready once `get_vectored` of the current batch is done and do not need an explicit timeout. This can be reasonably described as **pipelining of the protocol handler**. # Implementation We model the sub-protocol handler for pagestream requests (`handle_pagrequests`) as two futures that form a pipeline: 2. Batching: read requests from the connection and fill the current batch 3. Execution: `take` the current batch, execute it using `get_vectored`, and send the response. The Reading and Batching stage are connected through a new type of channel called `spsc_fold`. See the long comment in the `handle_pagerequests_pipelined` for details. # Changes - Refactor `handle_pagerequests` - separate functions for - reading one protocol message; produces a `BatchedFeMessage` with just one page request in it - batching; tried to merge an incoming `BatchedFeMessage` into an existing `BatchedFeMessage`; returns `None` on success and returns back the incoming message in case merging isn't possible - execution of a batched message - unify the timeline handle acquisition & request span construction; it now happen in the function that reads the protocol message - Implement serial and pipelined model - serial: what we had before any of the batching changes - read one protocol message - execute protocol messages - pipelined: the design described above - optionality for execution of the pipeline: either via concurrent futures vs tokio tasks - Pageserver config - remove batching timeout field - add ability to configure pipelining mode - add ability to limit max batch size for pipelined configurations (required for the rollout, cf https://github.com/neondatabase/cloud/issues/20620 ) - ability to configure execution mode - Tests - remove `batch_timeout` parametrization - rename `test_getpage_merge_smoke` to `test_throughput` - add parametrization to test different max batch sizes and execution moes - rename `test_timer_precision` to `test_latency` - rename the test case file to `test_page_service_batching.py` - better descriptions of what the tests actually do ## On the holding The `TimelineHandle` in the pending batch While batching, we hold the `TimelineHandle` in the pending batch. Therefore, the timeline will not finish shutting down while we're batching. 
This is not a problem in practice because the concurrently ongoing `get_vectored` call will fail quickly with an error indicating that the timeline is shutting down. This results in the Execution stage returning a `QueryError::Shutdown`, which causes the pipeline / entire page service connection to shut down. This drops all references to the `Arc>>>` object, thereby dropping the contained `TimelineHandle`s. - => fixes https://github.com/neondatabase/neon/issues/9850 # Performance Local run of the benchmarks, results in [this empty commit](https://github.com/neondatabase/neon/pull/9851/commits/1cf5b1463f69ba5066cbb0713912aec7bb5579ad) in the PR branch. Key take-aways: * `concurrent-futures` and `tasks` deliver identical `batching_factor` * tail latency impact unknown, cf https://github.com/neondatabase/neon/issues/9837 * `concurrent-futures` has higher throughput than `tasks` in all workloads (=lower `time` metric) * In unbatchable workloads, `concurrent-futures` has 5% higher `CPU-per-throughput` than that of `tasks`, and 15% higher than that of `serial`. * In batchable-32 workload, `concurrent-futures` has 8% lower `CPU-per-throughput` than that of `tasks` (comparison to tput of `serial` is irrelevant) * in unbatchable workloads, mean and tail latencies of `concurrent-futures` is practically identical to `serial`, whereas `tasks` adds 20-30us of overhead Overall, `concurrent-futures` seems like a slightly more attractive choice. # Rollout This change is disabled-by-default. Rollout plan: - https://github.com/neondatabase/cloud/issues/20620 # Refs - epic: https://github.com/neondatabase/neon/issues/9376 - this sub-task: https://github.com/neondatabase/neon/issues/9377 - the abandoned attempt to improve batching timeout resolution: https://github.com/neondatabase/neon/pull/9820 - closes https://github.com/neondatabase/neon/issues/9850 - fixes https://github.com/neondatabase/neon/issues/9835 --- Cargo.lock | 10 +- Cargo.toml | 1 + libs/pageserver_api/src/config.rs | 30 +- libs/utils/Cargo.toml | 2 + libs/utils/src/sync.rs | 2 + libs/utils/src/sync/spsc_fold.rs | 452 +++++++ pageserver/src/config.rs | 10 +- pageserver/src/lib.rs | 19 + pageserver/src/page_service.rs | 1059 ++++++++++------- test_runner/fixtures/neon_fixtures.py | 3 +- ...merge.py => test_page_service_batching.py} | 131 +- 11 files changed, 1262 insertions(+), 457 deletions(-) create mode 100644 libs/utils/src/sync/spsc_fold.rs rename test_runner/performance/pageserver/{test_pageserver_getpage_merge.py => test_page_service_batching.py} (69%) diff --git a/Cargo.lock b/Cargo.lock index abe69525c9..313222cf3c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 [[package]] name = "RustyXML" @@ -1717,6 +1717,12 @@ dependencies = [ "utils", ] +[[package]] +name = "diatomic-waker" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab03c107fafeb3ee9f5925686dbb7a73bc76e3932abb0d2b365cb64b169cf04c" + [[package]] name = "diesel" version = "2.2.3" @@ -7045,6 +7051,7 @@ dependencies = [ "chrono", "const_format", "criterion", + "diatomic-waker", "fail", "futures", "git-version", @@ -7063,6 +7070,7 @@ dependencies = [ "rand 0.8.5", "regex", "routerify", + "scopeguard", "sentry", "serde", "serde_assert", diff --git a/Cargo.toml b/Cargo.toml index 742201d0f5..64c384f17a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -83,6 +83,7 @@ comfy-table = "7.1" const_format = "0.2" crc32c = "0.6" dashmap = { version = "5.5.0", features = ["raw-api"] } +diatomic-waker = { version = "0.2.3" } either = "1.8" enum-map = "2.4.2" enumset = "1.0.12" diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 721d97404b..e49d15ba87 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -118,9 +118,8 @@ pub struct ConfigToml { pub virtual_file_io_mode: Option, #[serde(skip_serializing_if = "Option::is_none")] pub no_sync: Option, - #[serde(with = "humantime_serde")] - pub server_side_batch_timeout: Option, pub wal_receiver_protocol: PostgresClientProtocol, + pub page_service_pipelining: PageServicePipeliningConfig, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -137,6 +136,28 @@ pub struct DiskUsageEvictionTaskConfig { pub eviction_order: EvictionOrder, } +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case")] +#[serde(deny_unknown_fields)] +pub enum PageServicePipeliningConfig { + Serial, + Pipelined(PageServicePipeliningConfigPipelined), +} +#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(deny_unknown_fields)] +pub struct PageServicePipeliningConfigPipelined { + /// Causes runtime errors if larger than max get_vectored batch size. 
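For reference, a `pageserver.toml` fragment selecting the pipelined handler might look like the sketch below (illustrative only, derived from the serde attributes above: `mode` is the internally tagged discriminator, and the `mode`/`execution` values are kebab-cased variant names, while the struct fields keep their snake_case spelling):

```toml
# Omitting this section keeps the default, which is the serial handler
# (equivalent to mode = "serial").
[page_service_pipelining]
mode = "pipelined"
max_batch_size = 32                # must not exceed the max get_vectored batch size
execution = "concurrent-futures"   # alternative: "tasks"
```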
+ pub max_batch_size: NonZeroUsize, + pub execution: PageServiceProtocolPipelinedExecutionStrategy, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum PageServiceProtocolPipelinedExecutionStrategy { + ConcurrentFutures, + Tasks, +} + pub mod statvfs { pub mod mock { #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -332,8 +353,6 @@ pub mod defaults { pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512; - pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None; - pub const DEFAULT_WAL_RECEIVER_PROTOCOL: utils::postgres_client::PostgresClientProtocol = utils::postgres_client::PostgresClientProtocol::Vanilla; } @@ -420,11 +439,10 @@ impl Default for ConfigToml { ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, - server_side_batch_timeout: DEFAULT_SERVER_SIDE_BATCH_TIMEOUT - .map(|duration| humantime::parse_duration(duration).unwrap()), tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, + page_service_pipelining: PageServicePipeliningConfig::Serial, } } } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index f440b81d8f..5648072a83 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -19,6 +19,7 @@ bincode.workspace = true bytes.workspace = true camino.workspace = true chrono.workspace = true +diatomic-waker.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true @@ -45,6 +46,7 @@ tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } rand.workspace = true +scopeguard.workspace = true strum.workspace = true strum_macros.workspace = true url.workspace = true diff --git a/libs/utils/src/sync.rs b/libs/utils/src/sync.rs index 2ee8f35449..7aa26e24bc 100644 --- a/libs/utils/src/sync.rs +++ b/libs/utils/src/sync.rs @@ -1,3 +1,5 @@ pub mod heavier_once_cell; pub mod gate; + +pub mod spsc_fold; diff --git a/libs/utils/src/sync/spsc_fold.rs b/libs/utils/src/sync/spsc_fold.rs new file mode 100644 index 0000000000..b44f766ef0 --- /dev/null +++ b/libs/utils/src/sync/spsc_fold.rs @@ -0,0 +1,452 @@ +use core::{future::poll_fn, task::Poll}; +use std::sync::{Arc, Mutex}; + +use diatomic_waker::DiatomicWaker; + +pub struct Sender { + state: Arc>, +} + +pub struct Receiver { + state: Arc>, +} + +struct Inner { + wake_receiver: DiatomicWaker, + wake_sender: DiatomicWaker, + value: Mutex>, +} + +enum State { + NoData, + HasData(T), + TryFoldFailed, // transient state + SenderWaitsForReceiverToConsume(T), + SenderGone(Option), + ReceiverGone, + AllGone, + SenderDropping, // transient state + ReceiverDropping, // transient state +} + +pub fn channel() -> (Sender, Receiver) { + let inner = Inner { + wake_receiver: DiatomicWaker::new(), + wake_sender: DiatomicWaker::new(), + value: Mutex::new(State::NoData), + }; + + let state = Arc::new(inner); + ( + Sender { + state: state.clone(), + }, + Receiver { state }, + ) +} + +#[derive(Debug, thiserror::Error)] +pub enum SendError { + #[error("receiver is gone")] + ReceiverGone, +} + +impl Sender { + /// # Panics + /// + /// If `try_fold` panics, any subsequent call to `send` panic. 
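This channel is the glue between the pipeline's Batching and Execution stages: the sender keeps folding freshly read requests into the pending value via `try_fold` (note how `pagestream_do_batch` further down has exactly the required `Fn(&mut T, T) -> Result<(), T>` shape), and the receiver takes whole batches. A minimal usage sketch in the style of the unit tests at the end of this file (illustrative only, not part of the patch):

```rust
#[tokio::test]
async fn batching_sketch() {
    use utils::sync::spsc_fold::channel;

    let (mut tx, mut rx) = channel::<Vec<u32>>();

    // Two sends happen before anyone receives; try_fold merges the second
    // value into the pending batch instead of queueing a separate message.
    tx.send(vec![1], |batch, mut more| {
        batch.append(&mut more);
        Ok(())
    })
    .await
    .unwrap();
    tx.send(vec![2], |batch, mut more| {
        batch.append(&mut more);
        Ok(())
    })
    .await
    .unwrap();

    // A single recv observes both folded values.
    assert_eq!(rx.recv().await.unwrap(), vec![1, 2]);
}
```

Returning `Err(value)` from `try_fold` is how the sender signals that the incoming value cannot be merged; the sender then parks until the receiver has consumed the current batch, as exercised by the tests below.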
+ pub async fn send(&mut self, value: T, try_fold: F) -> Result<(), SendError> + where + F: Fn(&mut T, T) -> Result<(), T>, + { + let mut value = Some(value); + poll_fn(|cx| { + let mut guard = self.state.value.lock().unwrap(); + match &mut *guard { + State::NoData => { + *guard = State::HasData(value.take().unwrap()); + self.state.wake_receiver.notify(); + Poll::Ready(Ok(())) + } + State::HasData(_) => { + let State::HasData(acc_mut) = &mut *guard else { + unreachable!("this match arm guarantees that the guard is HasData"); + }; + match try_fold(acc_mut, value.take().unwrap()) { + Ok(()) => { + // no need to wake receiver, if it was waiting it already + // got a wake-up when we transitioned from NoData to HasData + Poll::Ready(Ok(())) + } + Err(unfoldable_value) => { + value = Some(unfoldable_value); + let State::HasData(acc) = + std::mem::replace(&mut *guard, State::TryFoldFailed) + else { + unreachable!("this match arm guarantees that the guard is HasData"); + }; + *guard = State::SenderWaitsForReceiverToConsume(acc); + // SAFETY: send is single threaded due to `&mut self` requirement, + // therefore register is not concurrent. + unsafe { + self.state.wake_sender.register(cx.waker()); + } + Poll::Pending + } + } + } + State::SenderWaitsForReceiverToConsume(_data) => { + // Really, we shouldn't be polled until receiver has consumed and wakes us. + Poll::Pending + } + State::ReceiverGone => Poll::Ready(Err(SendError::ReceiverGone)), + State::SenderGone(_) + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping + | State::TryFoldFailed => { + unreachable!(); + } + } + }) + .await + } +} + +impl Drop for Sender { + fn drop(&mut self) { + scopeguard::defer! { + self.state.wake_receiver.notify() + }; + let Ok(mut guard) = self.state.value.lock() else { + return; + }; + *guard = match std::mem::replace(&mut *guard, State::SenderDropping) { + State::NoData => State::SenderGone(None), + State::HasData(data) | State::SenderWaitsForReceiverToConsume(data) => { + State::SenderGone(Some(data)) + } + State::ReceiverGone => State::AllGone, + State::TryFoldFailed + | State::SenderGone(_) + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping => { + unreachable!("unreachable state {:?}", guard.discriminant_str()) + } + } + } +} + +#[derive(Debug, thiserror::Error)] +pub enum RecvError { + #[error("sender is gone")] + SenderGone, +} + +impl Receiver { + pub async fn recv(&mut self) -> Result { + poll_fn(|cx| { + let mut guard = self.state.value.lock().unwrap(); + match &mut *guard { + State::NoData => { + // SAFETY: recv is single threaded due to `&mut self` requirement, + // therefore register is not concurrent. + unsafe { + self.state.wake_receiver.register(cx.waker()); + } + Poll::Pending + } + guard @ State::HasData(_) + | guard @ State::SenderWaitsForReceiverToConsume(_) + | guard @ State::SenderGone(Some(_)) => { + let data = guard + .take_data() + .expect("in these states, data is guaranteed to be present"); + self.state.wake_sender.notify(); + Poll::Ready(Ok(data)) + } + State::SenderGone(None) => Poll::Ready(Err(RecvError::SenderGone)), + State::ReceiverGone + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping + | State::TryFoldFailed => { + unreachable!("unreachable state {:?}", guard.discriminant_str()); + } + } + }) + .await + } +} + +impl Drop for Receiver { + fn drop(&mut self) { + scopeguard::defer! 
{ + self.state.wake_sender.notify() + }; + let Ok(mut guard) = self.state.value.lock() else { + return; + }; + *guard = match std::mem::replace(&mut *guard, State::ReceiverDropping) { + State::NoData => State::ReceiverGone, + State::HasData(_) | State::SenderWaitsForReceiverToConsume(_) => State::ReceiverGone, + State::SenderGone(_) => State::AllGone, + State::TryFoldFailed + | State::ReceiverGone + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping => { + unreachable!("unreachable state {:?}", guard.discriminant_str()) + } + } + } +} + +impl State { + fn take_data(&mut self) -> Option { + match self { + State::HasData(_) => { + let State::HasData(data) = std::mem::replace(self, State::NoData) else { + unreachable!("this match arm guarantees that the state is HasData"); + }; + Some(data) + } + State::SenderWaitsForReceiverToConsume(_) => { + let State::SenderWaitsForReceiverToConsume(data) = + std::mem::replace(self, State::NoData) + else { + unreachable!( + "this match arm guarantees that the state is SenderWaitsForReceiverToConsume" + ); + }; + Some(data) + } + State::SenderGone(data) => Some(data.take().unwrap()), + State::NoData + | State::TryFoldFailed + | State::ReceiverGone + | State::AllGone + | State::SenderDropping + | State::ReceiverDropping => None, + } + } + fn discriminant_str(&self) -> &'static str { + match self { + State::NoData => "NoData", + State::HasData(_) => "HasData", + State::TryFoldFailed => "TryFoldFailed", + State::SenderWaitsForReceiverToConsume(_) => "SenderWaitsForReceiverToConsume", + State::SenderGone(_) => "SenderGone", + State::ReceiverGone => "ReceiverGone", + State::AllGone => "AllGone", + State::SenderDropping => "SenderDropping", + State::ReceiverDropping => "ReceiverDropping", + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + const FOREVER: std::time::Duration = std::time::Duration::from_secs(u64::MAX); + + #[tokio::test] + async fn test_send_recv() { + let (mut sender, mut receiver) = channel(); + + sender + .send(42, |acc, val| { + *acc += val; + Ok(()) + }) + .await + .unwrap(); + + let received = receiver.recv().await.unwrap(); + assert_eq!(received, 42); + } + + #[tokio::test] + async fn test_send_recv_with_fold() { + let (mut sender, mut receiver) = channel(); + + sender + .send(1, |acc, val| { + *acc += val; + Ok(()) + }) + .await + .unwrap(); + sender + .send(2, |acc, val| { + *acc += val; + Ok(()) + }) + .await + .unwrap(); + + let received = receiver.recv().await.unwrap(); + assert_eq!(received, 3); + } + + #[tokio::test(start_paused = true)] + async fn test_sender_waits_for_receiver_if_try_fold_fails() { + let (mut sender, mut receiver) = channel(); + + sender.send(23, |_, _| panic!("first send")).await.unwrap(); + + let send_fut = sender.send(42, |_, val| Err(val)); + let mut send_fut = std::pin::pin!(send_fut); + + tokio::select! { + _ = tokio::time::sleep(FOREVER) => {}, + _ = &mut send_fut => { + panic!("send should not complete"); + }, + } + + let val = receiver.recv().await.unwrap(); + assert_eq!(val, 23); + + tokio::select! 
{ + _ = tokio::time::sleep(FOREVER) => { + panic!("receiver should have consumed the value"); + }, + _ = &mut send_fut => { }, + } + + let val = receiver.recv().await.unwrap(); + assert_eq!(val, 42); + } + + #[tokio::test(start_paused = true)] + async fn test_sender_errors_if_waits_for_receiver_and_receiver_drops() { + let (mut sender, receiver) = channel(); + + sender.send(23, |_, _| unreachable!()).await.unwrap(); + + let send_fut = sender.send(42, |_, val| Err(val)); + let send_fut = std::pin::pin!(send_fut); + + drop(receiver); + + let result = send_fut.await; + assert!(matches!(result, Err(SendError::ReceiverGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_errors_if_waits_for_sender_and_sender_drops() { + let (sender, mut receiver) = channel::<()>(); + + let recv_fut = receiver.recv(); + let recv_fut = std::pin::pin!(recv_fut); + + drop(sender); + + let result = recv_fut.await; + assert!(matches!(result, Err(RecvError::SenderGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_errors_if_waits_for_sender_and_sender_drops_with_data() { + let (mut sender, mut receiver) = channel(); + + sender.send(42, |_, _| unreachable!()).await.unwrap(); + + { + let recv_fut = receiver.recv(); + let recv_fut = std::pin::pin!(recv_fut); + + drop(sender); + + let val = recv_fut.await.unwrap(); + assert_eq!(val, 42); + } + + let result = receiver.recv().await; + assert!(matches!(result, Err(RecvError::SenderGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_waits_for_sender_if_no_data() { + let (mut sender, mut receiver) = channel(); + + let recv_fut = receiver.recv(); + let mut recv_fut = std::pin::pin!(recv_fut); + + tokio::select! { + _ = tokio::time::sleep(FOREVER) => {}, + _ = &mut recv_fut => { + panic!("recv should not complete"); + }, + } + + sender.send(42, |_, _| Ok(())).await.unwrap(); + + let val = recv_fut.await.unwrap(); + assert_eq!(val, 42); + } + + #[tokio::test] + async fn test_receiver_gone_while_nodata() { + let (mut sender, receiver) = channel(); + drop(receiver); + + let result = sender.send(42, |_, _| Ok(())).await; + assert!(matches!(result, Err(SendError::ReceiverGone))); + } + + #[tokio::test] + async fn test_sender_gone_while_nodata() { + let (sender, mut receiver) = super::channel::(); + drop(sender); + + let result = receiver.recv().await; + assert!(matches!(result, Err(RecvError::SenderGone))); + } + + #[tokio::test(start_paused = true)] + async fn test_receiver_drops_after_sender_went_to_sleep() { + let (mut sender, receiver) = channel(); + let state = receiver.state.clone(); + + sender.send(23, |_, _| unreachable!()).await.unwrap(); + + let send_task = tokio::spawn(async move { sender.send(42, |_, v| Err(v)).await }); + + tokio::time::sleep(FOREVER).await; + + assert!(matches!( + &*state.value.lock().unwrap(), + &State::SenderWaitsForReceiverToConsume(_) + )); + + drop(receiver); + + let err = send_task + .await + .unwrap() + .expect_err("should unblock immediately"); + assert!(matches!(err, SendError::ReceiverGone)); + } + + #[tokio::test(start_paused = true)] + async fn test_sender_drops_after_receiver_went_to_sleep() { + let (sender, mut receiver) = channel::(); + let state = sender.state.clone(); + + let recv_task = tokio::spawn(async move { receiver.recv().await }); + + tokio::time::sleep(FOREVER).await; + + assert!(matches!(&*state.value.lock().unwrap(), &State::NoData)); + + drop(sender); + + let err = recv_task.await.unwrap().expect_err("should error"); + assert!(matches!(err, 
RecvError::SenderGone)); + } +} diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 2cf237e72b..1651db8500 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -188,11 +188,9 @@ pub struct PageServerConf { /// Optionally disable disk syncs (unsafe!) pub no_sync: bool, - /// Maximum amount of time for which a get page request request - /// might be held up for request merging. - pub server_side_batch_timeout: Option, - pub wal_receiver_protocol: PostgresClientProtocol, + + pub page_service_pipelining: pageserver_api::config::PageServicePipeliningConfig, } /// Token for authentication to safekeepers @@ -350,10 +348,10 @@ impl PageServerConf { concurrent_tenant_warmup, concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, - server_side_batch_timeout, tenant_config, no_sync, wal_receiver_protocol, + page_service_pipelining, } = config_toml; let mut conf = PageServerConf { @@ -393,11 +391,11 @@ impl PageServerConf { image_compression, timeline_offloading, ephemeral_bytes_per_memory_kb, - server_side_batch_timeout, import_pgdata_upcall_api, import_pgdata_upcall_api_token: import_pgdata_upcall_api_token.map(SecretString::from), import_pgdata_aws_endpoint_url, wal_receiver_protocol, + page_service_pipelining, // ------------------------------------------------------------ // fields that require additional validation or custom handling diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index ef6711397a..ff6af3566c 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -356,6 +356,25 @@ async fn timed( } } +/// Like [`timed`], but the warning timeout only starts after `cancel` has been cancelled. +async fn timed_after_cancellation( + fut: Fut, + name: &str, + warn_at: std::time::Duration, + cancel: &CancellationToken, +) -> ::Output { + let mut fut = std::pin::pin!(fut); + + tokio::select! 
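A hypothetical call site for this helper (the names below are placeholders, not taken from this patch): wrap a shutdown/join future so the "still waiting" warning only starts counting once shutdown has actually been requested, rather than for the whole lifetime of the future.

```rust
// Sketch: `join_all_connections()` and `connections_cancel` are stand-ins.
let joined = timed_after_cancellation(
    join_all_connections(),              // future we are waiting on
    "page_service connections shutdown", // name used in the warning log
    std::time::Duration::from_secs(1),   // warn_at: patience after cancellation
    &connections_cancel,
)
.await;
```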
{ + _ = cancel.cancelled() => { + timed(fut, name, warn_at).await + } + ret = &mut fut => { + ret + } + } +} + #[cfg(test)] mod timed_tests { use super::timed; diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 5fd02d8749..1917e7f5b7 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -7,6 +7,10 @@ use bytes::Buf; use futures::FutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; +use pageserver_api::config::{ + PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, + PageServiceProtocolPipelinedExecutionStrategy, +}; use pageserver_api::models::{self, TenantState}; use pageserver_api::models::{ PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse, @@ -16,12 +20,15 @@ use pageserver_api::models::{ PagestreamProtocolVersion, }; use pageserver_api::shard::TenantShardId; -use postgres_backend::{is_expected_io_error, AuthType, PostgresBackend, QueryError}; +use postgres_backend::{ + is_expected_io_error, AuthType, PostgresBackend, PostgresBackendReader, QueryError, +}; use pq_proto::framed::ConnectionError; use pq_proto::FeStartupPacket; use pq_proto::{BeMessage, FeMessage, RowDescriptor}; use std::borrow::Cow; use std::io; +use std::num::NonZeroUsize; use std::str; use std::str::FromStr; use std::sync::Arc; @@ -32,6 +39,7 @@ use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; +use utils::sync::spsc_fold; use utils::{ auth::{Claims, Scope, SwappableJwtAuth}, id::{TenantId, TimelineId}, @@ -40,7 +48,6 @@ use utils::{ }; use crate::auth::check_permission; -use crate::basebackup; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; @@ -58,6 +65,7 @@ use crate::tenant::timeline::{self, WaitLsnError}; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; +use crate::{basebackup, timed_after_cancellation}; use pageserver_api::key::rel_block_to_key; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; @@ -105,7 +113,7 @@ pub fn spawn( pg_auth, tcp_listener, conf.pg_auth_type, - conf.server_side_batch_timeout, + conf.page_service_pipelining.clone(), libpq_ctx, cancel.clone(), ) @@ -154,7 +162,7 @@ pub async fn libpq_listener_main( auth: Option>, listener: tokio::net::TcpListener, auth_type: AuthType, - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, listener_ctx: RequestContext, listener_cancel: CancellationToken, ) -> Connections { @@ -185,7 +193,7 @@ pub async fn libpq_listener_main( local_auth, socket, auth_type, - server_side_batch_timeout, + pipelining_config.clone(), connection_ctx, connections_cancel.child_token(), )); @@ -213,7 +221,7 @@ async fn page_service_conn_main( auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, ) -> ConnectionHandlerResult { @@ -256,7 +264,7 @@ async fn page_service_conn_main( // a while: we will tear down this PageServerHandler and instantiate a new one if/when // they reconnect. 
socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms))); - let socket = std::pin::pin!(socket); + let socket = Box::pin(socket); fail::fail_point!("ps::connection-start::pre-login"); @@ -267,7 +275,7 @@ async fn page_service_conn_main( let mut conn_handler = PageServerHandler::new( tenant_manager, auth, - server_side_batch_timeout, + pipelining_config, connection_ctx, cancel.clone(), ); @@ -283,7 +291,7 @@ async fn page_service_conn_main( info!("Postgres client disconnected ({io_error})"); Ok(()) } else { - let tenant_id = conn_handler.timeline_handles.tenant_id(); + let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); Err(io_error).context(format!( "Postgres connection error for tenant_id={:?} client at peer_addr={}", tenant_id, peer_addr @@ -291,7 +299,7 @@ async fn page_service_conn_main( } } other => { - let tenant_id = conn_handler.timeline_handles.tenant_id(); + let tenant_id = conn_handler.timeline_handles.as_ref().unwrap().tenant_id(); other.context(format!( "Postgres query error for tenant_id={:?} client peer_addr={}", tenant_id, peer_addr @@ -312,13 +320,10 @@ struct PageServerHandler { cancel: CancellationToken, - timeline_handles: TimelineHandles, + /// None only while pagestream protocol is being processed. + timeline_handles: Option, - /// Messages queued up for the next processing batch - next_batch: Option, - - /// See [`PageServerConf::server_side_batch_timeout`] - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, } struct TimelineHandles { @@ -535,10 +540,12 @@ impl From for QueryError { enum BatchedFeMessage { Exists { span: Span, + shard: timeline::handle::Handle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, + shard: timeline::handle::Handle, req: models::PagestreamNblocksRequest, }, GetPage { @@ -549,10 +556,12 @@ enum BatchedFeMessage { }, DbSize { span: Span, + shard: timeline::handle::Handle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, + shard: timeline::handle::Handle, req: models::PagestreamGetSlruSegmentRequest, }, RespondError { @@ -561,18 +570,11 @@ enum BatchedFeMessage { }, } -enum BatchOrEof { - /// In the common case, this has one entry. - /// At most, it has two entries: the first is the leftover batch, the second is an error. - Batch(smallvec::SmallVec<[BatchedFeMessage; 1]>), - Eof, -} - impl PageServerHandler { pub fn new( tenant_manager: Arc, auth: Option>, - server_side_batch_timeout: Option, + pipelining_config: PageServicePipeliningConfig, connection_ctx: RequestContext, cancel: CancellationToken, ) -> Self { @@ -580,10 +582,9 @@ impl PageServerHandler { auth, claims: None, connection_ctx, - timeline_handles: TimelineHandles::new(tenant_manager), + timeline_handles: Some(TimelineHandles::new(tenant_manager)), cancel, - next_batch: None, - server_side_batch_timeout, + pipelining_config, } } @@ -611,219 +612,356 @@ impl PageServerHandler { ) } - async fn read_batch_from_connection( - &mut self, - pgb: &mut PostgresBackend, - tenant_id: &TenantId, - timeline_id: &TimelineId, + async fn pagestream_read_message( + pgb: &mut PostgresBackendReader, + tenant_id: TenantId, + timeline_id: TimelineId, + timeline_handles: &mut TimelineHandles, + cancel: &CancellationToken, ctx: &RequestContext, - ) -> Result, QueryError> + parent_span: Span, + ) -> Result, QueryError> + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, + { + let msg = tokio::select! 
{ + biased; + _ = cancel.cancelled() => { + return Err(QueryError::Shutdown) + } + msg = pgb.read_message() => { msg } + }; + + let copy_data_bytes = match msg? { + Some(FeMessage::CopyData(bytes)) => bytes, + Some(FeMessage::Terminate) => { + return Ok(None); + } + Some(m) => { + return Err(QueryError::Other(anyhow::anyhow!( + "unexpected message: {m:?} during COPY" + ))); + } + None => { + return Ok(None); + } // client disconnected + }; + trace!("query: {copy_data_bytes:?}"); + + fail::fail_point!("ps::handle-pagerequest-message"); + + // parse request + let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; + + let batched_msg = match neon_fe_msg { + PagestreamFeMessage::Exists(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + BatchedFeMessage::Exists { span, shard, req } + } + PagestreamFeMessage::Nblocks(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + BatchedFeMessage::Nblocks { span, shard, req } + } + PagestreamFeMessage::DbSize(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + BatchedFeMessage::DbSize { span, shard, req } + } + PagestreamFeMessage::GetSlruSegment(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + BatchedFeMessage::GetSlruSegment { span, shard, req } + } + PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + request_lsn, + not_modified_since, + rel, + blkno, + }) => { + let span = tracing::info_span!(parent: parent_span, "handle_get_page_at_lsn_request_batched", req_lsn = %request_lsn); + + macro_rules! respond_error { + ($error:expr) => {{ + let error = BatchedFeMessage::RespondError { + span, + error: $error, + }; + Ok(Some(error)) + }}; + } + + let key = rel_block_to_key(rel, blkno); + let shard = match timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Page(key)) + .instrument(span.clone()) // sets `shard_id` field + .await + { + Ok(tl) => tl, + Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { + // We already know this tenant exists in general, because we resolved it at + // start of connection. Getting a NotFound here indicates that the shard containing + // the requested page is not present on this node: the client's knowledge of shard->pageserver + // mapping is out of date. + // + // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via + // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration + // and talk to a different pageserver. 
+ return respond_error!(PageStreamError::Reconnect( + "getpage@lsn request routed to wrong shard".into() + )); + } + Err(e) => { + return respond_error!(e.into()); + } + }; + let effective_request_lsn = match Self::wait_or_get_last_lsn( + &shard, + request_lsn, + not_modified_since, + &shard.get_latest_gc_cutoff_lsn(), + ctx, + ) + // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait + .await + { + Ok(lsn) => lsn, + Err(e) => { + return respond_error!(e); + } + }; + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages: smallvec::smallvec![(rel, blkno)], + } + } + }; + Ok(Some(batched_msg)) + } + + /// Post-condition: `batch` is Some() + #[instrument(skip_all, level = tracing::Level::TRACE)] + #[allow(clippy::boxed_local)] + fn pagestream_do_batch( + max_batch_size: NonZeroUsize, + batch: &mut Result, + this_msg: Result, + ) -> Result<(), Result> { + debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); + + let this_msg = match this_msg { + Ok(this_msg) => this_msg, + Err(e) => return Err(Err(e)), + }; + + match (&mut *batch, this_msg) { + // something batched already, let's see if we can add this message to the batch + ( + Ok(BatchedFeMessage::GetPage { + span: _, + shard: accum_shard, + pages: ref mut accum_pages, + effective_request_lsn: accum_lsn, + }), + BatchedFeMessage::GetPage { + span: _, + shard: this_shard, + pages: this_pages, + effective_request_lsn: this_lsn, + }, + ) if (|| { + assert_eq!(this_pages.len(), 1); + if accum_pages.len() >= max_batch_size.get() { + trace!(%accum_lsn, %this_lsn, %max_batch_size, "stopping batching because of batch size"); + assert_eq!(accum_pages.len(), max_batch_size.get()); + return false; + } + if (accum_shard.tenant_shard_id, accum_shard.timeline_id) + != (this_shard.tenant_shard_id, this_shard.timeline_id) + { + trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); + // TODO: we _could_ batch & execute each shard seperately (and in parallel). + // But the current logic for keeping responses in order does not support that. + return false; + } + // the vectored get currently only supports a single LSN, so, bounce as soon + // as the effective request_lsn changes + if *accum_lsn != this_lsn { + trace!(%accum_lsn, %this_lsn, "stopping batching because LSN changed"); + return false; + } + true + })() => + { + // ok to batch + accum_pages.extend(this_pages); + Ok(()) + } + // something batched already but this message is unbatchable + (_, this_msg) => { + // by default, don't continue batching + Err(Ok(this_msg)) + } + } + } + + #[instrument(level = tracing::Level::DEBUG, skip_all)] + async fn pagesteam_handle_batched_message( + &mut self, + pgb_writer: &mut PostgresBackend, + batch: BatchedFeMessage, + cancel: &CancellationToken, + ctx: &RequestContext, + ) -> Result<(), QueryError> where IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { - let mut batch = self.next_batch.take(); - let mut batch_started_at: Option = None; - - let next_batch: Option = loop { - let sleep_fut = match (self.server_side_batch_timeout, batch_started_at) { - (Some(batch_timeout), Some(started_at)) => futures::future::Either::Left( - tokio::time::sleep_until((started_at + batch_timeout).into()), - ), - _ => futures::future::Either::Right(futures::future::pending()), - }; - - let msg = tokio::select! 
{ - biased; - _ = self.cancel.cancelled() => { - return Err(QueryError::Shutdown) - } - msg = pgb.read_message() => { - msg - } - _ = sleep_fut => { - assert!(batch.is_some()); - break None; - } - }; - let copy_data_bytes = match msg? { - Some(FeMessage::CopyData(bytes)) => bytes, - Some(FeMessage::Terminate) => { - return Ok(Some(BatchOrEof::Eof)); - } - Some(m) => { - return Err(QueryError::Other(anyhow::anyhow!( - "unexpected message: {m:?} during COPY" - ))); - } - None => { - return Ok(Some(BatchOrEof::Eof)); - } // client disconnected - }; - trace!("query: {copy_data_bytes:?}"); - fail::fail_point!("ps::handle-pagerequest-message"); - - // parse request - let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?; - - let this_msg = match neon_fe_msg { - PagestreamFeMessage::Exists(req) => BatchedFeMessage::Exists { - span: tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::Nblocks(req) => BatchedFeMessage::Nblocks { - span: tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::DbSize(req) => BatchedFeMessage::DbSize { - span: tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::GetSlruSegment(req) => BatchedFeMessage::GetSlruSegment { - span: tracing::info_span!("handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn), - req, - }, - PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - request_lsn, - not_modified_since, - rel, - blkno, - }) => { - // shard_id is filled in by the handler - let span = tracing::info_span!( - "handle_get_page_at_lsn_request_batched", - %tenant_id, %timeline_id, shard_id = tracing::field::Empty, req_lsn = %request_lsn, - batch_size = tracing::field::Empty, batch_id = tracing::field::Empty - ); - - macro_rules! current_batch_and_error { - ($error:expr) => {{ - let error = BatchedFeMessage::RespondError { - span, - error: $error, - }; - let batch_and_error = match batch { - Some(b) => smallvec::smallvec![b, error], - None => smallvec::smallvec![error], - }; - Ok(Some(BatchOrEof::Batch(batch_and_error))) - }}; - } - - let key = rel_block_to_key(rel, blkno); - let shard = match self - .timeline_handles - .get(*tenant_id, *timeline_id, ShardSelector::Page(key)) - .instrument(span.clone()) - .await - { - Ok(tl) => tl, - Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => { - // We already know this tenant exists in general, because we resolved it at - // start of connection. Getting a NotFound here indicates that the shard containing - // the requested page is not present on this node: the client's knowledge of shard->pageserver - // mapping is out of date. - // - // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via - // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration - // and talk to a different pageserver. 
- return current_batch_and_error!(PageStreamError::Reconnect( - "getpage@lsn request routed to wrong shard".into() - )); - } - Err(e) => { - return current_batch_and_error!(e.into()); - } - }; - let effective_request_lsn = match Self::wait_or_get_last_lsn( - &shard, - request_lsn, - not_modified_since, - &shard.get_latest_gc_cutoff_lsn(), - ctx, - ) - // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait - .await - { - Ok(lsn) => lsn, - Err(e) => { - return current_batch_and_error!(e); - } - }; - BatchedFeMessage::GetPage { + // invoke handler function + let (handler_results, span): (Vec>, _) = + match batch { + BatchedFeMessage::Exists { span, shard, req } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + ( + vec![ + self.handle_get_rel_exists_request(&shard, &req, ctx) + .instrument(span.clone()) + .await, + ], span, - shard, - effective_request_lsn, - pages: smallvec::smallvec![(rel, blkno)], - } + ) + } + BatchedFeMessage::Nblocks { span, shard, req } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + ( + vec![ + self.handle_get_nblocks_request(&shard, &req, ctx) + .instrument(span.clone()) + .await, + ], + span, + ) + } + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + ( + { + let npages = pages.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_get_page_at_lsn_request_batched( + &shard, + effective_request_lsn, + pages, + ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::DbSize { span, shard, req } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + ( + vec![ + self.handle_db_size_request(&shard, &req, ctx) + .instrument(span.clone()) + .await, + ], + span, + ) + } + BatchedFeMessage::GetSlruSegment { span, shard, req } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + ( + vec![ + self.handle_get_slru_segment_request(&shard, &req, ctx) + .instrument(span.clone()) + .await, + ], + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need to + // call the handler. + (vec![Err(error)], span) } }; - let batch_timeout = match self.server_side_batch_timeout { - Some(value) => value, - None => { - // Batching is not enabled - stop on the first message. - return Ok(Some(BatchOrEof::Batch(smallvec::smallvec![this_msg]))); - } + // Map handler result to protocol behavior. + // Some handler errors cause exit from pagestream protocol. + // Other handler errors are sent back as an error message and we stay in pagestream protocol. + for handler_result in handler_results { + let response_msg = match handler_result { + Err(e) => match &e { + PageStreamError::Shutdown => { + // If we fail to fulfil a request during shutdown, which may be _because_ of + // shutdown, then do not send the error to the client. Instead just drop the + // connection. 
+ span.in_scope(|| info!("dropping connection due to shutdown")); + return Err(QueryError::Shutdown); + } + PageStreamError::Reconnect(reason) => { + span.in_scope(|| info!("handler requested reconnect: {reason}")); + return Err(QueryError::Reconnect); + } + PageStreamError::Read(_) + | PageStreamError::LsnTimeout(_) + | PageStreamError::NotFound(_) + | PageStreamError::BadRequest(_) => { + // print the all details to the log with {:#}, but for the client the + // error message is enough. Do not log if shutting down, as the anyhow::Error + // here includes cancellation which is not an error. + let full = utils::error::report_compact_sources(&e); + span.in_scope(|| { + error!("error reading relation or page version: {full:#}") + }); + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }) + } + }, + Ok(response_msg) => response_msg, }; - // check if we can batch - match (&mut batch, this_msg) { - (None, this_msg) => { - batch = Some(this_msg); - } - ( - Some(BatchedFeMessage::GetPage { - span: _, - shard: accum_shard, - pages: accum_pages, - effective_request_lsn: accum_lsn, - }), - BatchedFeMessage::GetPage { - span: _, - shard: this_shard, - pages: this_pages, - effective_request_lsn: this_lsn, - }, - ) if async { - assert_eq!(this_pages.len(), 1); - if accum_pages.len() >= Timeline::MAX_GET_VECTORED_KEYS as usize { - assert_eq!(accum_pages.len(), Timeline::MAX_GET_VECTORED_KEYS as usize); - return false; - } - if (accum_shard.tenant_shard_id, accum_shard.timeline_id) - != (this_shard.tenant_shard_id, this_shard.timeline_id) - { - // TODO: we _could_ batch & execute each shard seperately (and in parallel). - // But the current logic for keeping responses in order does not support that. - return false; - } - // the vectored get currently only supports a single LSN, so, bounce as soon - // as the effective request_lsn changes - if *accum_lsn != this_lsn { - return false; - } - true - } - .await => - { - // ok to batch - accum_pages.extend(this_pages); - } - (Some(_), this_msg) => { - // by default, don't continue batching - break Some(this_msg); - } + // marshal & transmit response message + pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + } + tokio::select! { + biased; + _ = cancel.cancelled() => { + // We were requested to shut down. + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) } - - // batching impl piece - let started_at = batch_started_at.get_or_insert_with(Instant::now); - if started_at.elapsed() > batch_timeout { - break None; + res = pgb_writer.flush() => { + res?; } - }; - - self.next_batch = next_batch; - Ok(batch.map(|b| BatchOrEof::Batch(smallvec::smallvec![b]))) + } + Ok(()) } /// Pagestream sub-protocol handler. @@ -845,7 +983,7 @@ impl PageServerHandler { ctx: RequestContext, ) -> Result<(), QueryError> where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); @@ -861,169 +999,283 @@ impl PageServerHandler { } } - // If [`PageServerHandler`] is reused for multiple pagestreams, - // then make sure to not process requests from the previous ones. 
- self.next_batch = None; + let pgb_reader = pgb + .split() + .context("implementation error: split pgb into reader and writer")?; - loop { - let maybe_batched = self - .read_batch_from_connection(pgb, &tenant_id, &timeline_id, &ctx) - .await?; - let batched = match maybe_batched { - Some(BatchOrEof::Batch(b)) => b, - Some(BatchOrEof::Eof) => { - break; - } + let timeline_handles = self + .timeline_handles + .take() + .expect("implementation error: timeline_handles should not be locked"); + + let request_span = info_span!("request", shard_id = tracing::field::Empty); + let ((pgb_reader, timeline_handles), result) = match self.pipelining_config.clone() { + PageServicePipeliningConfig::Pipelined(pipelining_config) => { + self.handle_pagerequests_pipelined( + pgb, + pgb_reader, + tenant_id, + timeline_id, + timeline_handles, + request_span, + pipelining_config, + &ctx, + ) + .await + } + PageServicePipeliningConfig::Serial => { + self.handle_pagerequests_serial( + pgb, + pgb_reader, + tenant_id, + timeline_id, + timeline_handles, + request_span, + &ctx, + ) + .await + } + }; + + debug!("pagestream subprotocol shut down cleanly"); + + pgb.unsplit(pgb_reader) + .context("implementation error: unsplit pgb")?; + + let replaced = self.timeline_handles.replace(timeline_handles); + assert!(replaced.is_none()); + + result + } + + #[allow(clippy::too_many_arguments)] + async fn handle_pagerequests_serial( + &mut self, + pgb_writer: &mut PostgresBackend, + mut pgb_reader: PostgresBackendReader, + tenant_id: TenantId, + timeline_id: TimelineId, + mut timeline_handles: TimelineHandles, + request_span: Span, + ctx: &RequestContext, + ) -> ( + (PostgresBackendReader, TimelineHandles), + Result<(), QueryError>, + ) + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, + { + let cancel = self.cancel.clone(); + let err = loop { + let msg = Self::pagestream_read_message( + &mut pgb_reader, + tenant_id, + timeline_id, + &mut timeline_handles, + &cancel, + ctx, + request_span.clone(), + ) + .await; + let msg = match msg { + Ok(msg) => msg, + Err(e) => break e, + }; + let msg = match msg { + Some(msg) => msg, None => { - continue; + debug!("pagestream subprotocol end observed"); + return ((pgb_reader, timeline_handles), Ok(())); } }; + let err = self + .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx) + .await; + match err { + Ok(()) => {} + Err(e) => break e, + } + }; + ((pgb_reader, timeline_handles), Err(err)) + } - for batch in batched { - // invoke handler function - let (handler_results, span): ( - Vec>, - _, - ) = match batch { - BatchedFeMessage::Exists { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - ( - vec![ - self.handle_get_rel_exists_request( - tenant_id, - timeline_id, - &req, - &ctx, - ) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::Nblocks { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - ( - vec![ - self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetPage { - span, - shard, - effective_request_lsn, - pages, - } => { - fail::fail_point!("ps::handle-pagerequest-message::getpage"); - ( - { - let npages = pages.len(); - let res = self - .handle_get_page_at_lsn_request_batched( - &shard, - effective_request_lsn, - pages, - &ctx, - ) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::DbSize { span, req } => 
{ - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - ( - vec![ - self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetSlruSegment { span, req } => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - ( - vec![ - self.handle_get_slru_segment_request( - tenant_id, - timeline_id, - &req, - &ctx, - ) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::RespondError { span, error } => { - // We've already decided to respond with an error, so we don't need to - // call the handler. - (vec![Err(error)], span) - } - }; + /// # Cancel-Safety + /// + /// May leak tokio tasks if not polled to completion. + #[allow(clippy::too_many_arguments)] + async fn handle_pagerequests_pipelined( + &mut self, + pgb_writer: &mut PostgresBackend, + pgb_reader: PostgresBackendReader, + tenant_id: TenantId, + timeline_id: TimelineId, + mut timeline_handles: TimelineHandles, + request_span: Span, + pipelining_config: PageServicePipeliningConfigPipelined, + ctx: &RequestContext, + ) -> ( + (PostgresBackendReader, TimelineHandles), + Result<(), QueryError>, + ) + where + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, + { + // + // Pipelined pagestream handling consists of + // - a Batcher that reads requests off the wire and + // and batches them if possible, + // - an Executor that processes the batched requests. + // + // The batch is built up inside an `spsc_fold` channel, + // shared betwen Batcher (Sender) and Executor (Receiver). + // + // The Batcher continously folds client requests into the batch, + // while the Executor can at any time take out what's in the batch + // in order to process it. + // This means the next batch builds up while the Executor + // executes the last batch. + // + // CANCELLATION + // + // We run both Batcher and Executor futures to completion before + // returning from this function. + // + // If Executor exits first, it signals cancellation to the Batcher + // via a CancellationToken that is child of `self.cancel`. + // If Batcher exits first, it signals cancellation to the Executor + // by dropping the spsc_fold channel Sender. + // + // CLEAN SHUTDOWN + // + // Clean shutdown means that the client ends the COPYBOTH session. + // In response to such a client message, the Batcher exits. + // The Executor continues to run, draining the spsc_fold channel. + // Once drained, the spsc_fold recv will fail with a distinct error + // indicating that the sender disconnected. + // The Executor exits with Ok(()) in response to that error. + // + // Server initiated shutdown is not clean shutdown, but instead + // is an error Err(QueryError::Shutdown) that is propagated through + // error propagation. + // + // ERROR PROPAGATION + // + // When the Batcher encounter an error, it sends it as a value + // through the spsc_fold channel and exits afterwards. + // When the Executor observes such an error in the channel, + // it exits returning that error value. + // + // This design ensures that the Executor stage will still process + // the batch that was in flight when the Batcher encountered an error, + // thereby beahving identical to a serial implementation. - // Map handler result to protocol behavior. - // Some handler errors cause exit from pagestream protocol. - // Other handler errors are sent back as an error message and we stay in pagestream protocol. 
- for handler_result in handler_results { - let response_msg = match handler_result { - Err(e) => match &e { - PageStreamError::Shutdown => { - // If we fail to fulfil a request during shutdown, which may be _because_ of - // shutdown, then do not send the error to the client. Instead just drop the - // connection. - span.in_scope(|| info!("dropping connection due to shutdown")); - return Err(QueryError::Shutdown); - } - PageStreamError::Reconnect(reason) => { - span.in_scope(|| info!("handler requested reconnect: {reason}")); - return Err(QueryError::Reconnect); - } - PageStreamError::Read(_) - | PageStreamError::LsnTimeout(_) - | PageStreamError::NotFound(_) - | PageStreamError::BadRequest(_) => { - // print the all details to the log with {:#}, but for the client the - // error message is enough. Do not log if shutting down, as the anyhow::Error - // here includes cancellation which is not an error. - let full = utils::error::report_compact_sources(&e); - span.in_scope(|| { - error!("error reading relation or page version: {full:#}") - }); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) - } - }, - Ok(response_msg) => response_msg, - }; + let PageServicePipeliningConfigPipelined { + max_batch_size, + execution, + } = pipelining_config; - // marshal & transmit response message - pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; + // Macro to _define_ a pipeline stage. + macro_rules! pipeline_stage { + ($name:literal, $cancel:expr, $make_fut:expr) => {{ + let cancel: CancellationToken = $cancel; + let stage_fut = $make_fut(cancel.clone()); + async move { + scopeguard::defer! { + debug!("exiting"); + } + timed_after_cancellation(stage_fut, $name, Duration::from_millis(100), &cancel) + .await } - tokio::select! { - biased; - _ = self.cancel.cancelled() => { - // We were requested to shut down. 
- info!("shutdown request received in page handler"); - return Err(QueryError::Shutdown) - } - res = pgb.flush() => { - res?; - } + .instrument(tracing::info_span!($name)) + }}; + } + + // + // Batcher + // + + let cancel_batcher = self.cancel.child_token(); + let (mut batch_tx, mut batch_rx) = spsc_fold::channel(); + let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| { + let ctx = ctx.attached_child(); + async move { + let mut pgb_reader = pgb_reader; + let mut exit = false; + while !exit { + let read_res = Self::pagestream_read_message( + &mut pgb_reader, + tenant_id, + timeline_id, + &mut timeline_handles, + &cancel_batcher, + &ctx, + request_span.clone(), + ) + .await; + let Some(read_res) = read_res.transpose() else { + debug!("client-initiated shutdown"); + break; + }; + exit |= read_res.is_err(); + let could_send = batch_tx + .send(read_res, |batch, res| { + Self::pagestream_do_batch(max_batch_size, batch, res) + }) + .await; + exit |= could_send.is_err(); + } + (pgb_reader, timeline_handles) + } + }); + + // + // Executor + // + + let executor = pipeline_stage!("executor", self.cancel.clone(), move |cancel| { + let ctx = ctx.attached_child(); + async move { + let _cancel_batcher = cancel_batcher.drop_guard(); + loop { + let maybe_batch = batch_rx.recv().await; + let batch = match maybe_batch { + Ok(batch) => batch, + Err(spsc_fold::RecvError::SenderGone) => { + debug!("upstream gone"); + return Ok(()); + } + }; + let batch = match batch { + Ok(batch) => batch, + Err(e) => { + return Err(e); + } + }; + self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) + .await?; } } + }); + + // + // Execute the stages. + // + + match execution { + PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => { + tokio::join!(batcher, executor) + } + PageServiceProtocolPipelinedExecutionStrategy::Tasks => { + // These tasks are not tracked anywhere. + let read_messages_task = tokio::spawn(batcher); + let (read_messages_task_res, executor_res_) = + tokio::join!(read_messages_task, executor,); + ( + read_messages_task_res.expect("propagated panic from read_messages"), + executor_res_, + ) + } } - Ok(()) } /// Helper function to handle the LSN from client request. 
@@ -1131,6 +1383,8 @@ impl PageServerHandler { { let timeline = self .timeline_handles + .as_mut() + .unwrap() .get( tenant_shard_id.tenant_id, timeline_id, @@ -1165,22 +1419,17 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_get_rel_exists_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1200,23 +1449,17 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_get_nblocks_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1236,23 +1479,17 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_db_size_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1300,23 +1537,17 @@ impl PageServerHandler { #[instrument(skip_all, fields(shard_id))] async fn handle_get_slru_segment_request( &mut self, - tenant_id: TenantId, - timeline_id: TimelineId, + timeline: &Timeline, req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let timeline = self - .timeline_handles - .get(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let _timer = timeline .query_metrics .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( - &timeline, + timeline, req.request_lsn, req.not_modified_since, &latest_gc_cutoff_lsn, @@ -1374,6 +1605,8 @@ impl PageServerHandler { let timeline = self .timeline_handles + .as_mut() + .unwrap() .get(tenant_id, timeline_id, ShardSelector::Zero) .await?; @@ -1716,7 +1949,7 @@ impl PageServiceCmd { impl postgres_backend::Handler for PageServerHandler where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, + IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static, { fn check_auth_jwt( &mut self, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e3c88e9965..9bcfffeb9c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3804,9 +3804,10 @@ class Endpoint(PgProtocol, LogUtils): # 
shared_buffers = 512kB to make postgres use LFC intensively # neon.max_file_cache_size and neon.file_cache size limit are # set to 1MB because small LFC is better for testing (helps to find more problems) + lfc_path_escaped = str(lfc_path).replace("'", "''") config_lines = [ "shared_buffers = 512kB", - f"neon.file_cache_path = '{self.lfc_path()}'", + f"neon.file_cache_path = '{lfc_path_escaped}'", "neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB", ] + config_lines diff --git a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py b/test_runner/performance/pageserver/test_page_service_batching.py similarity index 69% rename from test_runner/performance/pageserver/test_pageserver_getpage_merge.py rename to test_runner/performance/pageserver/test_page_service_batching.py index 34cce9900b..c47a849fec 100644 --- a/test_runner/performance/pageserver/test_pageserver_getpage_merge.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -11,36 +11,95 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, PgBin, wait_for_last_flush_lsn from fixtures.utils import humantime_to_ms -TARGET_RUNTIME = 60 +TARGET_RUNTIME = 30 + + +@dataclass +class PageServicePipeliningConfig: + pass + + +@dataclass +class PageServicePipeliningConfigSerial(PageServicePipeliningConfig): + mode: str = "serial" + + +@dataclass +class PageServicePipeliningConfigPipelined(PageServicePipeliningConfig): + max_batch_size: int + execution: str + mode: str = "pipelined" + + +EXECUTION = ["concurrent-futures", "tasks"] + +NON_BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] +for max_batch_size in [1, 32]: + for execution in EXECUTION: + NON_BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + +BATCHABLE: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] +for max_batch_size in [1, 2, 4, 8, 16, 32]: + for execution in EXECUTION: + BATCHABLE.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) -@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095") @pytest.mark.parametrize( - "tablesize_mib, batch_timeout, target_runtime, effective_io_concurrency, readhead_buffer_size, name", + "tablesize_mib, pipelining_config, target_runtime, effective_io_concurrency, readhead_buffer_size, name", [ - # the next 4 cases demonstrate how not-batchable workloads suffer from batching timeout - (50, None, TARGET_RUNTIME, 1, 128, "not batchable no batching"), - (50, "10us", TARGET_RUNTIME, 1, 128, "not batchable 10us timeout"), - (50, "1ms", TARGET_RUNTIME, 1, 128, "not batchable 1ms timeout"), - # the next 4 cases demonstrate how batchable workloads benefit from batching - (50, None, TARGET_RUNTIME, 100, 128, "batchable no batching"), - (50, "10us", TARGET_RUNTIME, 100, 128, "batchable 10us timeout"), - (50, "100us", TARGET_RUNTIME, 100, 128, "batchable 100us timeout"), - (50, "1ms", TARGET_RUNTIME, 100, 128, "batchable 1ms timeout"), + # non-batchable workloads + # (A separate benchmark will consider latency). 
+ *[ + ( + 50, + config, + TARGET_RUNTIME, + 1, + 128, + f"not batchable {dataclasses.asdict(config)}", + ) + for config in NON_BATCHABLE + ], + # batchable workloads should show throughput and CPU efficiency improvements + *[ + ( + 50, + config, + TARGET_RUNTIME, + 100, + 128, + f"batchable {dataclasses.asdict(config)}", + ) + for config in BATCHABLE + ], ], ) -def test_getpage_merge_smoke( +def test_throughput( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, tablesize_mib: int, - batch_timeout: str | None, + pipelining_config: PageServicePipeliningConfig, target_runtime: int, effective_io_concurrency: int, readhead_buffer_size: int, name: str, ): """ - Do a bunch of sequential scans and ensure that the pageserver does some merging. + Do a bunch of sequential scans with varying compute and pipelining configurations. + Primary performance metrics are the achieved batching factor and throughput (wall clock time). + Resource utilization is also interesting - we currently measure CPU time. + + The test is a fixed-runtime based type of test (target_runtime). + Hence, the results are normalized to the number of iterations completed within target runtime. + + If the compute doesn't provide pipeline depth (effective_io_concurrency=1), + performance should be about identical in all configurations. + Pipelining can still yield improvements in these scenarios because it parses the + next request while the current one is still being executed. + + If the compute provides pipeline depth (effective_io_concurrency=100), then + pipelining configs, especially with max_batch_size>1 should yield dramatic improvements + in all performance metrics. """ # @@ -51,14 +110,16 @@ def test_getpage_merge_smoke( params.update( { "tablesize_mib": (tablesize_mib, {"unit": "MiB"}), - "batch_timeout": ( - -1 if batch_timeout is None else 1e3 * humantime_to_ms(batch_timeout), - {"unit": "us"}, - ), # target_runtime is just a polite ask to the workload to run for this long "effective_io_concurrency": (effective_io_concurrency, {}), "readhead_buffer_size": (readhead_buffer_size, {}), - # name is not a metric + # name is not a metric, we just use it to identify the test easily in the `test_...[...]`` notation + } + ) + params.update( + { + f"pipelining_config.{k}": (v, {}) + for k, v in dataclasses.asdict(pipelining_config).items() } ) @@ -170,7 +231,9 @@ def test_getpage_merge_smoke( after = get_metrics() return (after - before).normalize(iters - 1) - env.pageserver.patch_config_toml_nonrecursive({"server_side_batch_timeout": batch_timeout}) + env.pageserver.patch_config_toml_nonrecursive( + {"page_service_pipelining": dataclasses.asdict(pipelining_config)} + ) env.pageserver.restart() metrics = workload() @@ -199,23 +262,30 @@ def test_getpage_merge_smoke( ) -@pytest.mark.skip("See https://github.com/neondatabase/neon/pull/9820#issue-2675856095") +PRECISION_CONFIGS: list[PageServicePipeliningConfig] = [PageServicePipeliningConfigSerial()] +for max_batch_size in [1, 32]: + for execution in EXECUTION: + PRECISION_CONFIGS.append(PageServicePipeliningConfigPipelined(max_batch_size, execution)) + + @pytest.mark.parametrize( - "batch_timeout", [None, "10us", "20us", "50us", "100us", "200us", "500us", "1ms"] + "pipelining_config,name", + [(config, f"{dataclasses.asdict(config)}") for config in PRECISION_CONFIGS], ) -def test_timer_precision( +def test_latency( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, - batch_timeout: str | None, + pipelining_config: 
PageServicePipeliningConfig, + name: str, ): """ - Determine the batching timeout precision (mean latency) and tail latency impact. + Measure the latency impact of pipelining in an un-batchable workloads. - The baseline is `None`; an ideal batching timeout implementation would increase - the mean latency by exactly `batch_timeout`. + An ideal implementation should not increase average or tail latencies for such workloads. - That is not the case with the current implementation, will be addressed in future changes. + We don't have support in pagebench to create queue depth yet. + => https://github.com/neondatabase/neon/issues/9837 """ # @@ -223,7 +293,8 @@ def test_timer_precision( # def patch_ps_config(ps_config): - ps_config["server_side_batch_timeout"] = batch_timeout + if pipelining_config is not None: + ps_config["page_service_pipelining"] = dataclasses.asdict(pipelining_config) neon_env_builder.pageserver_config_override = patch_ps_config From 8b7e9ed820e5ae4a4da6cd2f88ed89ba47739e4f Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Sat, 30 Nov 2024 11:11:37 +0100 Subject: [PATCH 147/209] Merge the consumption metric pushes (#9939) #8564 ## Problem The main and backup consumption metric pushes are completely independent, resulting in different event time windows and different idempotency keys. ## Summary of changes * Merge the push tasks, but keep chunks the same size. --- Cargo.lock | 1 + libs/consumption_metrics/src/lib.rs | 5 +- proxy/Cargo.toml | 1 + proxy/src/bin/proxy.rs | 4 - proxy/src/usage_metrics.rs | 351 ++++++++++++++-------------- 5 files changed, 181 insertions(+), 181 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 313222cf3c..5ce27a7d45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4501,6 +4501,7 @@ dependencies = [ "ecdsa 0.16.9", "env_logger", "fallible-iterator", + "flate2", "framed-websockets", "futures", "hashbrown 0.14.5", diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index fbe2e6830f..448134f31a 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -103,11 +103,12 @@ impl<'a> IdempotencyKey<'a> { } } +/// Split into chunks of 1000 metrics to avoid exceeding the max request size. pub const CHUNK_SIZE: usize = 1000; // Just a wrapper around a slice of events // to serialize it as `{"events" : [ ] } -#[derive(serde::Serialize, Deserialize)] -pub struct EventChunk<'a, T: Clone> { +#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq)] +pub struct EventChunk<'a, T: Clone + PartialEq> { pub events: std::borrow::Cow<'a, [T]>, } diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 0d774d529d..f5934c8a89 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -112,6 +112,7 @@ workspace_hack.workspace = true [dev-dependencies] camino-tempfile.workspace = true fallible-iterator.workspace = true +flate2.workspace = true tokio-tungstenite.workspace = true pbkdf2 = { workspace = true, features = ["simple", "std"] } rcgen.workspace = true diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index a935378162..b772a987ee 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -517,10 +517,6 @@ async fn main() -> anyhow::Result<()> { if let Some(metrics_config) = &config.metric_collection { // TODO: Add gc regardles of the metric collection being enabled. 
maintenance_tasks.spawn(usage_metrics::task_main(metrics_config)); - client_tasks.spawn(usage_metrics::task_backup( - &metrics_config.backup_metric_collection_config, - cancellation_token.clone(), - )); } if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index c5e8588623..65e74466f2 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,19 +1,18 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. +use std::borrow::Cow; use std::convert::Infallible; -use std::pin::pin; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::sync::Arc; use std::time::Duration; -use anyhow::Context; +use anyhow::{bail, Context}; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use dashmap::mapref::entry::Entry; use dashmap::DashMap; -use futures::future::select; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; @@ -23,7 +22,7 @@ use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; -use crate::config::{MetricBackupCollectionConfig, MetricCollectionConfig}; +use crate::config::MetricCollectionConfig; use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}; use crate::http; use crate::intern::{BranchIdInt, EndpointIdInt}; @@ -58,55 +57,21 @@ trait MetricCounterReporter { fn move_metrics(&self) -> (u64, usize); } -#[derive(Debug)] -struct MetricBackupCounter { - transmitted: AtomicU64, - opened_connections: AtomicUsize, -} - -impl MetricCounterRecorder for MetricBackupCounter { - fn record_egress(&self, bytes: u64) { - self.transmitted.fetch_add(bytes, Ordering::AcqRel); - } - - fn record_connection(&self, count: usize) { - self.opened_connections.fetch_add(count, Ordering::AcqRel); - } -} - -impl MetricCounterReporter for MetricBackupCounter { - fn get_metrics(&mut self) -> (u64, usize) { - ( - *self.transmitted.get_mut(), - *self.opened_connections.get_mut(), - ) - } - fn move_metrics(&self) -> (u64, usize) { - ( - self.transmitted.swap(0, Ordering::AcqRel), - self.opened_connections.swap(0, Ordering::AcqRel), - ) - } -} - #[derive(Debug)] pub(crate) struct MetricCounter { transmitted: AtomicU64, opened_connections: AtomicUsize, - backup: Arc, } impl MetricCounterRecorder for MetricCounter { /// Record that some bytes were sent from the proxy to the client fn record_egress(&self, bytes: u64) { - self.transmitted.fetch_add(bytes, Ordering::AcqRel); - self.backup.record_egress(bytes); + self.transmitted.fetch_add(bytes, Ordering::Relaxed); } /// Record that some connections were opened fn record_connection(&self, count: usize) { - self.opened_connections.fetch_add(count, Ordering::AcqRel); - self.backup.record_connection(count); + self.opened_connections.fetch_add(count, Ordering::Relaxed); } } @@ -119,8 +84,8 @@ impl MetricCounterReporter for MetricCounter { } fn move_metrics(&self) -> (u64, usize) { ( - self.transmitted.swap(0, Ordering::AcqRel), - self.opened_connections.swap(0, Ordering::AcqRel), + self.transmitted.swap(0, Ordering::Relaxed), + self.opened_connections.swap(0, Ordering::Relaxed), ) } } @@ -173,26 +138,11 @@ type FastHasher = std::hash::BuildHasherDefault; #[derive(Default)] pub(crate) struct Metrics { 
endpoints: DashMap, FastHasher>, - backup_endpoints: DashMap, FastHasher>, } impl Metrics { /// Register a new byte metrics counter for this endpoint pub(crate) fn register(&self, ids: Ids) -> Arc { - let backup = if let Some(entry) = self.backup_endpoints.get(&ids) { - entry.clone() - } else { - self.backup_endpoints - .entry(ids.clone()) - .or_insert_with(|| { - Arc::new(MetricBackupCounter { - transmitted: AtomicU64::new(0), - opened_connections: AtomicUsize::new(0), - }) - }) - .clone() - }; - let entry = if let Some(entry) = self.endpoints.get(&ids) { entry.clone() } else { @@ -202,7 +152,6 @@ impl Metrics { Arc::new(MetricCounter { transmitted: AtomicU64::new(0), opened_connections: AtomicUsize::new(0), - backup: backup.clone(), }) }) .clone() @@ -227,6 +176,21 @@ pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result anyhow::Result( now: DateTime, chunk_size: usize, ) -> impl Iterator>> + 'a { - // Split into chunks of 1000 metrics to avoid exceeding the max request size metrics_to_send .chunks(chunk_size) .map(move |chunk| EventChunk { @@ -303,11 +268,14 @@ fn create_event_chunks<'a>( }) } +#[expect(clippy::too_many_arguments)] #[instrument(skip_all)] async fn collect_metrics_iteration( endpoints: &DashMap, FastHasher>, client: &http::ClientWithMiddleware, metric_collection_endpoint: &reqwest::Url, + storage: Option<&GenericRemoteStorage>, + outer_chunk_size: usize, hostname: &str, prev: DateTime, now: DateTime, @@ -323,17 +291,54 @@ async fn collect_metrics_iteration( trace!("no new metrics to send"); } + let cancel = CancellationToken::new(); + let path_prefix = create_remote_path_prefix(now); + // Send metrics. - for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, CHUNK_SIZE) { + for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, outer_chunk_size) { + tokio::join!( + upload_main_events_chunked(client, metric_collection_endpoint, &chunk, CHUNK_SIZE), + async { + if let Err(e) = upload_backup_events(storage, &chunk, &path_prefix, &cancel).await { + error!("failed to upload consumption events to remote storage: {e:?}"); + } + } + ); + } +} + +fn create_remote_path_prefix(now: DateTime) -> String { + format!( + "year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z", + year = now.year(), + month = now.month(), + day = now.day(), + hour = now.hour(), + minute = now.minute(), + second = now.second(), + ) +} + +async fn upload_main_events_chunked( + client: &http::ClientWithMiddleware, + metric_collection_endpoint: &reqwest::Url, + chunk: &EventChunk<'_, Event>, + subchunk_size: usize, +) { + // Split into smaller chunks to avoid exceeding the max request size + for subchunk in chunk.events.chunks(subchunk_size).map(|c| EventChunk { + events: Cow::Borrowed(c), + }) { let res = client .post(metric_collection_endpoint.clone()) - .json(&chunk) + .json(&subchunk) .send() .await; let res = match res { Ok(x) => x, Err(err) => { + // TODO: retry? 
error!("failed to send metrics: {:?}", err); continue; } @@ -341,7 +346,7 @@ async fn collect_metrics_iteration( if !res.status().is_success() { error!("metrics endpoint refused the sent metrics: {:?}", res); - for metric in chunk.events.iter().filter(|e| e.value > (1u64 << 40)) { + for metric in subchunk.events.iter().filter(|e| e.value > (1u64 << 40)) { // Report if the metric value is suspiciously large warn!("potentially abnormal metric value: {:?}", metric); } @@ -349,113 +354,34 @@ async fn collect_metrics_iteration( } } -pub async fn task_backup( - backup_config: &MetricBackupCollectionConfig, - cancellation_token: CancellationToken, -) -> anyhow::Result<()> { - info!("metrics backup config: {backup_config:?}"); - scopeguard::defer! { - info!("metrics backup has shut down"); - } - // Even if the remote storage is not configured, we still want to clear the metrics. - let storage = if let Some(config) = backup_config.remote_storage_config.as_ref() { - Some( - GenericRemoteStorage::from_config(config) - .await - .context("remote storage init")?, - ) - } else { - None - }; - let mut ticker = tokio::time::interval(backup_config.interval); - let mut prev = Utc::now(); - let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned(); - loop { - select(pin!(ticker.tick()), pin!(cancellation_token.cancelled())).await; - let now = Utc::now(); - collect_metrics_backup_iteration( - &USAGE_METRICS.backup_endpoints, - storage.as_ref(), - &hostname, - prev, - now, - backup_config.chunk_size, - ) - .await; - - prev = now; - if cancellation_token.is_cancelled() { - info!("metrics backup has been cancelled"); - break; - } - } - Ok(()) -} - -#[instrument(skip_all)] -async fn collect_metrics_backup_iteration( - endpoints: &DashMap, FastHasher>, +async fn upload_backup_events( storage: Option<&GenericRemoteStorage>, - hostname: &str, - prev: DateTime, - now: DateTime, - chunk_size: usize, -) { - let year = now.year(); - let month = now.month(); - let day = now.day(); - let hour = now.hour(); - let minute = now.minute(); - let second = now.second(); - let cancel = CancellationToken::new(); - - info!("starting collect_metrics_backup_iteration"); - - let metrics_to_send = collect_and_clear_metrics(endpoints); - - if metrics_to_send.is_empty() { - trace!("no new metrics to send"); - } - - // Send metrics. 
- for chunk in create_event_chunks(&metrics_to_send, hostname, prev, now, chunk_size) { - let real_now = Utc::now(); - let id = uuid::Uuid::new_v7(Timestamp::from_unix( - NoContext, - real_now.second().into(), - real_now.nanosecond(), - )); - let path = format!("year={year:04}/month={month:02}/day={day:02}/{hour:02}:{minute:02}:{second:02}Z_{id}.json.gz"); - let remote_path = match RemotePath::from_string(&path) { - Ok(remote_path) => remote_path, - Err(e) => { - error!("failed to create remote path from str {path}: {:?}", e); - continue; - } - }; - - let res = upload_events_chunk(storage, chunk, &remote_path, &cancel).await; - - if let Err(e) = res { - error!( - "failed to upload consumption events to remote storage: {:?}", - e - ); - } - } -} - -async fn upload_events_chunk( - storage: Option<&GenericRemoteStorage>, - chunk: EventChunk<'_, Event>, - remote_path: &RemotePath, + chunk: &EventChunk<'_, Event>, + path_prefix: &str, cancel: &CancellationToken, ) -> anyhow::Result<()> { let Some(storage) = storage else { - error!("no remote storage configured"); + warn!("no remote storage configured"); return Ok(()); }; - let data = serde_json::to_vec(&chunk).context("serialize metrics")?; + + let real_now = Utc::now(); + let id = uuid::Uuid::new_v7(Timestamp::from_unix( + NoContext, + real_now.second().into(), + real_now.nanosecond(), + )); + let path = format!("{path_prefix}_{id}.json.gz"); + let remote_path = match RemotePath::from_string(&path) { + Ok(remote_path) => remote_path, + Err(e) => { + bail!("failed to create remote path from str {path}: {:?}", e); + } + }; + + // TODO: This is async compression from Vec to Vec. Rewrite as byte stream. + // Use sync compression in blocking threadpool. + let data = serde_json::to_vec(chunk).context("serialize metrics")?; let mut encoder = GzipEncoder::new(Vec::new()); encoder.write_all(&data).await.context("compress metrics")?; encoder.shutdown().await.context("compress metrics")?; @@ -464,7 +390,7 @@ async fn upload_events_chunk( || async { let stream = futures::stream::once(futures::future::ready(Ok(compressed_data.clone()))); storage - .upload(stream, compressed_data.len(), remote_path, None, cancel) + .upload(stream, compressed_data.len(), &remote_path, None, cancel) .await }, TimeoutOrCancel::caused_by_cancel, @@ -482,9 +408,12 @@ async fn upload_events_chunk( #[cfg(test)] mod tests { + use std::fs; + use std::io::BufReader; use std::sync::{Arc, Mutex}; use anyhow::Error; + use camino_tempfile::tempdir; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; use http_body_util::BodyExt; @@ -493,6 +422,7 @@ mod tests { use hyper::service::service_fn; use hyper::{Request, Response}; use hyper_util::rt::TokioIo; + use remote_storage::{RemoteStorageConfig, RemoteStorageKind}; use tokio::net::TcpListener; use url::Url; @@ -538,8 +468,34 @@ mod tests { let endpoint = Url::parse(&format!("http://{addr}")).unwrap(); let now = Utc::now(); + let storage_test_dir = tempdir().unwrap(); + let local_fs_path = storage_test_dir.path().join("usage_metrics"); + fs::create_dir_all(&local_fs_path).unwrap(); + let storage = GenericRemoteStorage::from_config(&RemoteStorageConfig { + storage: RemoteStorageKind::LocalFs { + local_path: local_fs_path.clone(), + }, + timeout: Duration::from_secs(10), + small_timeout: Duration::from_secs(1), + }) + .await + .unwrap(); + + let mut pushed_chunks: Vec = Vec::new(); + let mut stored_chunks: Vec = Vec::new(); + // no counters have been registered - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, 
"foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert!(r.is_empty()); @@ -551,39 +507,84 @@ mod tests { }); // the counter should be observed despite 0 egress - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); assert_eq!(r[0].events[0].value, 0); + pushed_chunks.extend(r); // record egress counter.record_egress(1); // egress should be observered - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); assert_eq!(r[0].events[0].value, 1); + pushed_chunks.extend(r); // release counter drop(counter); // we do not observe the counter - collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; + collect_metrics_iteration( + &metrics.endpoints, + &client, + &endpoint, + Some(&storage), + 1000, + "foo", + now, + now, + ) + .await; let r = std::mem::take(&mut *reports.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered assert!(metrics.endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) - .await; - assert!(!metrics.backup_endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) - .await; - // backup counter is unregistered after the second iteration - assert!(metrics.backup_endpoints.is_empty()); + let path_prefix = create_remote_path_prefix(now); + for entry in walkdir::WalkDir::new(&local_fs_path) + .into_iter() + .filter_map(|e| e.ok()) + { + let path = local_fs_path.join(&path_prefix).to_string(); + if entry.path().to_str().unwrap().starts_with(&path) { + let chunk = serde_json::from_reader(flate2::bufread::GzDecoder::new( + BufReader::new(fs::File::open(entry.into_path()).unwrap()), + )) + .unwrap(); + stored_chunks.push(chunk); + } + } + storage_test_dir.close().ok(); + + // sort by first event's idempotency key because the order of files is nondeterministic + pushed_chunks.sort_by_cached_key(|c| c.events[0].idempotency_key.clone()); + stored_chunks.sort_by_cached_key(|c| c.events[0].idempotency_key.clone()); + assert_eq!(pushed_chunks, stored_chunks); } } From 62b74bdc2c3f8123a14380ef17c7e506479cba83 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 1 Dec 2024 14:23:10 +0200 Subject: [PATCH 148/209] Add GUC controlling whether to pause recovery if some critical GUCs at replica have smaller value than on primary (#9057) ## Problem See https://github.com/neondatabase/neon/issues/9023 ## Summary of changes Ass GUC `recovery_pause_on_misconfig` allowing not to pause in case of replica and primary configuration mismatch See https://github.com/neondatabase/postgres/pull/501 See https://github.com/neondatabase/postgres/pull/502 See https://github.com/neondatabase/postgres/pull/503 See https://github.com/neondatabase/postgres/pull/504 ## Checklist before requesting a review - [ ] I have 
performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- pgxn/neon/neon.c | 13 ++ .../regress/test_physical_replication.py | 221 +++++++++++++++++- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 +- 7 files changed, 241 insertions(+), 9 deletions(-) diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index 51b9f58bbc..ff08f9164d 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -15,6 +15,9 @@ #include "access/subtrans.h" #include "access/twophase.h" #include "access/xlog.h" +#if PG_MAJORVERSION_NUM >= 15 +#include "access/xlogrecovery.h" +#endif #include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" @@ -432,6 +435,16 @@ _PG_init(void) restore_running_xacts_callback = RestoreRunningXactsFromClog; + DefineCustomBoolVariable( + "neon.allow_replica_misconfig", + "Allow replica startup when some critical GUCs have smaller value than on primary node", + NULL, + &allowReplicaMisconfig, + true, + PGC_POSTMASTER, + 0, + NULL, NULL, NULL); + DefineCustomEnumVariable( "neon.running_xacts_overflow_policy", "Action performed on snapshot overflow when restoring runnings xacts from CLOG", diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index 043aff686b..6cb11b825d 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -4,6 +4,10 @@ import random import time from typing import TYPE_CHECKING +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import wait_replica_caughtup + if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv @@ -19,8 +23,8 @@ def test_physical_replication(neon_simple_env: NeonEnv): p_cur.execute( "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))" ) - time.sleep(1) with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary: + wait_replica_caughtup(primary, secondary) with primary.connect() as p_con: with p_con.cursor() as p_cur: with secondary.connect() as s_con: @@ -42,3 +46,218 @@ def test_physical_replication(neon_simple_env: NeonEnv): s_cur.execute( "select * from t where pk=%s", (random.randrange(1, 2 * pk),) ) + + +def test_physical_replication_config_mismatch_max_connections(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_connections). + PostgreSQL enforces that settings that affect how many transactions can be open at the same time + have values equal to or higher in a hot standby replica than in the primary. If they don't, the replica refuses + to start up. If the settings are changed in the primary, it emits a WAL record with the new settings, and + when the replica sees that record it pauses the replay. + + PostgreSQL enforces this to ensure that the replica can hold all the XIDs in the so-called + "known-assigned XIDs" array, which is a fixed size array that needs to be allocated + upfront and server startup. 
That's pretty pessimistic, though; usually you can get + away with smaller settings, because we allocate space for 64 subtransactions per + transaction too. If you get unlucky and you run out of space, WAL redo dies with + "ERROR: too many KnownAssignedXids". It's better to take the chances than refuse + to start up, especially in Neon: if the WAL redo dies, the server is restarted, which is + no worse than refusing to start up in the first place. Furthermore, the control plane + tries to ensure that on restart, the settings are set high enough, so most likely it will + work after restart. Because of that, we have patched Postgres to disable to checks when + the `recovery_pause_on_misconfig` setting is set to `false` (which is the default on neon). + + This test tests all those cases of running out of space in known-assigned XIDs array that + we can hit with `recovery_pause_on_misconfig=false`, which are unreachable in unpatched + Postgres. + There's a similar check for `max_locks_per_transactions` too, which is related to running out + of space in the lock manager rather than known-assigned XIDs. Similar story with that, although + running out of space in the lock manager is possible in unmodified Postgres too. Enforcing the + check for `max_locks_per_transactions` ensures that you don't run out of space in the lock manager + when there are no read-only queries holding locks in the replica, but you can still run out if you have + those. + """ + env = neon_simple_env + with env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) as primary: + with primary.connect() as p_con: + with p_con.cursor() as p_cur: + p_cur.execute( + "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))" + ) + with env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["max_connections=5"], + ) as secondary: + wait_replica_caughtup(primary, secondary) + with secondary.connect() as s_con: + with s_con.cursor() as s_cur: + cursors = [] + for i in range(10): + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("begin") + p_cur.execute("insert into t (pk) values (%s)", (i,)) + cursors.append(p_cur) + + for p_cur in cursors: + p_cur.execute("commit") + + wait_replica_caughtup(primary, secondary) + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == 10 + + +def test_physical_replication_config_mismatch_max_prepared(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_prepared_transactions). + If number of transactions at primary exceeds its limit at replica then WAL replay is terminated. 
+ """ + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=["max_prepared_transactions=10"], + ) + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))") + + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["max_prepared_transactions=5"], + ) + wait_replica_caughtup(primary, secondary) + + s_con = secondary.connect() + s_cur = s_con.cursor() + cursors = [] + for i in range(10): + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("begin") + p_cur.execute("insert into t (pk) values (%s)", (i,)) + p_cur.execute(f"prepare transaction 't{i}'") + cursors.append(p_cur) + + for i in range(10): + cursors[i].execute(f"commit prepared 't{i}'") + + time.sleep(5) + with pytest.raises(Exception) as e: + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == 10 + secondary.stop() + + log.info(f"Replica crashed with {e}") + assert secondary.log_contains("maximum number of prepared transactions reached") + + +def connect(ep): + max_reconnect_attempts = 10 + for _ in range(max_reconnect_attempts): + try: + return ep.connect() + except Exception as e: + log.info(f"Failed to connect with primary: {e}") + time.sleep(1) + + +def test_physical_replication_config_mismatch_too_many_known_xids(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_connections). + In this case large difference in this setting and larger number of concurrent transactions at primary + # cause too many known xids error at replica. + """ + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=[ + "max_connections=1000", + "shared_buffers=128MB", # prevent "no unpinned buffers available" error + ], + ) + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=[ + "max_connections=2", + "autovacuum_max_workers=1", + "max_worker_processes=5", + "max_wal_senders=1", + "superuser_reserved_connections=0", + ], + ) + + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(x integer)") + + n_connections = 990 + cursors = [] + for i in range(n_connections): + p_con = connect(primary) + p_cur = p_con.cursor() + p_cur.execute("begin") + p_cur.execute(f"insert into t values({i})") + cursors.append(p_cur) + + for cur in cursors: + cur.execute("commit") + + time.sleep(5) + with pytest.raises(Exception) as e: + s_con = secondary.connect() + s_cur = s_con.cursor() + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == n_connections + secondary.stop() + + log.info(f"Replica crashed with {e}") + assert secondary.log_contains("too many KnownAssignedXids") + + +def test_physical_replication_config_mismatch_max_locks_per_transaction(neon_simple_env: NeonEnv): + """ + Test for primary and replica with different configuration settings (max_locks_per_transaction). + In conjunction with different number of max_connections at primary and standby it can cause "out of shared memory" + error if the primary obtains more AccessExclusiveLocks than the standby can hold. 
+ """ + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=[ + "max_locks_per_transaction = 100", + ], + ) + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=[ + "max_connections=10", + "max_locks_per_transaction = 10", + ], + ) + + n_tables = 1000 + + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("begin") + for i in range(n_tables): + p_cur.execute(f"CREATE TABLE t_{i}(x integer)") + p_cur.execute("commit") + + with pytest.raises(Exception) as e: + wait_replica_caughtup(primary, secondary) + secondary.stop() + + log.info(f"Replica crashed with {e}") + assert secondary.log_contains("You might need to increase") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index c1989c934d..373f9decad 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit c1989c934d46e04e78b3c496c8a34bcd40ddceeb +Subproject commit 373f9decad933d2d46f321231032ae8b0da81acd diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index d929b9a8b9..972e325e62 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6 +Subproject commit 972e325e62b455957adbbdd8580e31275bb5b8c9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 13e9e35394..dff6615a8e 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 13e9e3539419003e79bd9aa29e1bc44f3fd555dd +Subproject commit dff6615a8e48a10bb17a03fa3c00635f1ace7a92 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index faebe5e5af..a10d95be67 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit faebe5e5aff5687908504453623778f8515529db +Subproject commit a10d95be67265e0f10a422ba0457f5a7af01de71 diff --git a/vendor/revisions.json b/vendor/revisions.json index abeddcadf7..8a73e14dcf 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "faebe5e5aff5687908504453623778f8515529db" + "a10d95be67265e0f10a422ba0457f5a7af01de71" ], "v16": [ "16.6", - "13e9e3539419003e79bd9aa29e1bc44f3fd555dd" + "dff6615a8e48a10bb17a03fa3c00635f1ace7a92" ], "v15": [ "15.10", - "d929b9a8b9f32f6fe5a0eac3e6e963f0e44e27e6" + "972e325e62b455957adbbdd8580e31275bb5b8c9" ], "v14": [ "14.15", - "c1989c934d46e04e78b3c496c8a34bcd40ddceeb" + "373f9decad933d2d46f321231032ae8b0da81acd" ] } From 030810ed3e44acf2ff9845c4ce57b2f75d01dfcf Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sun, 1 Dec 2024 13:04:37 +0000 Subject: [PATCH 149/209] Compute image: prepare Postgres v14-v16 for Debian 12 (#9954) ## Problem Current compute images for Postgres 14-16 don't build on Debian 12 because of issues with extensions. This PR fixes that, but for the current setup, it is mostly a no-op change. 
## Summary of changes - Use `/bin/bash -euo pipefail` as SHELL to fail earlier - Fix `plv8` build: backport a trivial patch for v8 - Fix `postgis` build: depend `sfgal` version on Debian version instead of Postgres version Tested in: https://github.com/neondatabase/neon/pull/9849 --- compute/compute-node.Dockerfile | 17 ++++++++----- compute/patches/plv8-3.1.10.patch | 42 +++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 6 deletions(-) create mode 100644 compute/patches/plv8-3.1.10.patch diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 2fcd9985bc..9567018053 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -14,6 +14,9 @@ ARG DEBIAN_FLAVOR=${DEBIAN_VERSION}-slim FROM debian:$DEBIAN_FLAVOR AS build-deps ARG DEBIAN_VERSION +# Use strict mode for bash to catch errors early +SHELL ["/bin/bash", "-euo", "pipefail", "-c"] + RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. @@ -106,6 +109,7 @@ RUN cd postgres && \ # ######################################################################################### FROM build-deps AS postgis-build +ARG DEBIAN_VERSION ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ @@ -122,12 +126,12 @@ RUN apt update && \ # and also we must check backward compatibility with older versions of PostGIS. # # Use new version only for v17 -RUN case "${PG_VERSION}" in \ - "v17") \ +RUN case "${DEBIAN_VERSION}" in \ + "bookworm") \ export SFCGAL_VERSION=1.4.1 \ export SFCGAL_CHECKSUM=1800c8a26241588f11cddcf433049e9b9aea902e923414d2ecef33a3295626c3 \ ;; \ - "v14" | "v15" | "v16") \ + "bullseye") \ export SFCGAL_VERSION=1.3.10 \ export SFCGAL_CHECKSUM=4e39b3b2adada6254a7bdba6d297bb28e1a9835a9f879b74f37e2dab70203232 \ ;; \ @@ -228,6 +232,8 @@ FROM build-deps AS plv8-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY compute/patches/plv8-3.1.10.patch /plv8-3.1.10.patch + RUN apt update && \ apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang @@ -239,8 +245,6 @@ RUN apt update && \ # # Use new version only for v17 # because since v3.2, plv8 doesn't include plcoffee and plls extensions -ENV PLV8_TAG=v3.2.3 - RUN case "${PG_VERSION}" in \ "v17") \ export PLV8_TAG=v3.2.3 \ @@ -255,8 +259,9 @@ RUN case "${PG_VERSION}" in \ git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ tar -czf plv8.tar.gz --exclude .git plv8-src && \ cd plv8-src && \ + if [[ "${PG_VERSION}" < "v17" ]]; then patch -p1 < /plv8-3.1.10.patch; fi && \ # generate and copy upgrade scripts - mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ + mkdir -p upgrade && ./generate_upgrade.sh ${PLV8_TAG#v} && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ export PATH="/usr/local/pgsql/bin:$PATH" && \ make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \ diff --git a/compute/patches/plv8-3.1.10.patch b/compute/patches/plv8-3.1.10.patch new file mode 100644 index 0000000000..43cdb479f7 --- /dev/null +++ b/compute/patches/plv8-3.1.10.patch @@ -0,0 +1,42 @@ +commit 46b38d3e46f9cd6c70d9b189dd6ff4abaa17cf5e +Author: Alexander Bayandin +Date: Sat Nov 30 18:29:32 2024 +0000 + + Fix v8 9.7.37 compilation on Debian 12 + +diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch +new file mode 
100644 +index 0000000..f0a5dc7 +--- /dev/null ++++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch +@@ -0,0 +1,30 @@ ++From 84cf3230a9680aac3b73c410c2b758760b6d3066 Mon Sep 17 00:00:00 2001 ++From: Michael Lippautz ++Date: Thu, 27 Jan 2022 14:14:11 +0100 ++Subject: [PATCH] cppgc: Fix include ++ ++Add to cover for std::exchange. ++ ++Bug: v8:12585 ++Change-Id: Ida65144e93e466be8914527d0e646f348c136bcb ++Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3420309 ++Auto-Submit: Michael Lippautz ++Reviewed-by: Omer Katz ++Commit-Queue: Michael Lippautz ++Cr-Commit-Position: refs/heads/main@{#78820} ++--- ++ src/heap/cppgc/prefinalizer-handler.h | 1 + ++ 1 file changed, 1 insertion(+) ++ ++diff --git a/src/heap/cppgc/prefinalizer-handler.h b/src/heap/cppgc/prefinalizer-handler.h ++index bc17c99b1838..c82c91ff5a45 100644 ++--- a/src/heap/cppgc/prefinalizer-handler.h +++++ b/src/heap/cppgc/prefinalizer-handler.h ++@@ -5,6 +5,7 @@ ++ #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ ++ #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_ ++ +++#include ++ #include ++ ++ #include "include/cppgc/prefinalizer.h" From 1e08b5dccc13030f76081d9a6c0cd343c8c29393 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Sun, 1 Dec 2024 17:47:28 +0200 Subject: [PATCH 150/209] Fix issues with prefetch ring buffer resize (#9847) ## Problem See https://neondb.slack.com/archives/C04DGM6SMTM/p1732110190129479 We observe the following error in the logs ``` [XX000] ERROR: [NEON_SMGR] [shard 3] Incorrect prefetch read: status=1 response=0x7fafef335138 my=128 receive=128 ``` most likely caused by changing `neon.readahead_buffer_size` ## Summary of changes 1. Copy shard state 2. Do not use prefetch_set_unused in readahead_buffer_resize 3. Change prefetch buffer overflow criteria --------- Co-authored-by: Konstantin Knizhnik --- pgxn/neon/pagestore_smgr.c | 13 +++++- .../regress/test_prefetch_buffer_resize.py | 40 +++++++++++++++++++ 2 files changed, 51 insertions(+), 2 deletions(-) create mode 100644 test_runner/regress/test_prefetch_buffer_resize.py diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index cbb0e2ae6d..a5e0c402fb 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -439,6 +439,8 @@ readahead_buffer_resize(int newsize, void *extra) newPState->ring_unused = newsize; newPState->ring_receive = newsize; newPState->ring_flush = newsize; + newPState->max_shard_no = MyPState->max_shard_no; + memcpy(newPState->shard_bitmap, MyPState->shard_bitmap, sizeof(MyPState->shard_bitmap)); /* * Copy over the prefetches. @@ -495,7 +497,11 @@ readahead_buffer_resize(int newsize, void *extra) for (; end >= MyPState->ring_last && end != UINT64_MAX; end -= 1) { - prefetch_set_unused(end); + PrefetchRequest *slot = GetPrfSlot(end); + if (slot->status == PRFS_RECEIVED) + { + pfree(slot->response); + } } prfh_destroy(MyPState->prf_hash); @@ -944,6 +950,9 @@ Retry: Assert(entry == NULL); Assert(slot == NULL); + /* There should be no buffer overflow */ + Assert(MyPState->ring_last + readahead_buffer_size >= MyPState->ring_unused); + /* * If the prefetch queue is full, we need to make room by clearing the * oldest slot. 
If the oldest slot holds a buffer that was already
@@ -958,7 +967,7 @@ Retry:
 	 * a prefetch request kind of goes against the principles of
 	 * prefetching)
 	 */
-	if (MyPState->ring_last + readahead_buffer_size - 1 == MyPState->ring_unused)
+	if (MyPState->ring_last + readahead_buffer_size == MyPState->ring_unused)
 	{
 		uint64 cleanup_index = MyPState->ring_last;
 
diff --git a/test_runner/regress/test_prefetch_buffer_resize.py b/test_runner/regress/test_prefetch_buffer_resize.py
new file mode 100644
index 0000000000..7676b78b0e
--- /dev/null
+++ b/test_runner/regress/test_prefetch_buffer_resize.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import random
+
+import pytest
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+@pytest.mark.parametrize("shard_count", [None, 4])
+@pytest.mark.timeout(600)
+def test_prefetch(neon_env_builder: NeonEnvBuilder, shard_count: int | None):
+    if shard_count is not None:
+        neon_env_builder.num_pageservers = shard_count
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=shard_count,
+    )
+    n_iter = 10
+    n_rec = 100000
+
+    endpoint = env.endpoints.create_start(
+        "main",
+        config_lines=[
+            "shared_buffers=10MB",
+        ],
+    )
+
+    cur = endpoint.connect().cursor()
+
+    cur.execute("CREATE TABLE t(pk integer, filler text default repeat('?', 200))")
+    cur.execute(f"insert into t (pk) values (generate_series(1,{n_rec}))")
+
+    cur.execute("set statement_timeout=0")
+    cur.execute("set effective_io_concurrency=20")
+    cur.execute("set max_parallel_workers_per_gather=0")
+
+    for _ in range(n_iter):
+        buf_size = random.randrange(16, 32)
+        cur.execute(f"set neon.readahead_buffer_size={buf_size}")
+        limit = random.randrange(1, n_rec)
+        cur.execute(f"select sum(pk) from (select pk from t limit {limit}) s")

From fdf231c237c1929f1b11c893dc9666e4c89e1722 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Sun, 1 Dec 2024 18:09:58 +0000
Subject: [PATCH 151/209] storcon: don't take any Service locks in /status and /ready (#9944)

## Problem

We saw unexpected container terminations when running in k8s with small CPU resource requests.
The /status and /ready handlers called `maybe_forward`, which always takes the lock on
Service::inner. If there is a lot of writer lock contention, and the container is starved of CPU,
this increases the likelihood that we will get killed by the kubelet.

It isn't certain that this was a cause of issues, but it is a potential source that we can eliminate.
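In short, the health endpoints should consult the static non-forward list before touching any shared state. As a rough illustration of that ordering, here is a Python sketch with hypothetical names; it is not the controller's Rust code (the actual change follows below):

```
# Illustrative sketch only: names are hypothetical stand-ins for the controller's
# Rust internals. The point is the ordering: check the static list before locking.
import threading

NOT_FOR_FORWARD = {"/status", "/ready"}  # endpoints named in this commit; the real constant may hold more

inner_lock = threading.Lock()   # stands in for the contended Service::inner lock
leadership_status = "Leader"    # stands in for the controller's leadership state


def maybe_forward(uri: str) -> str:
    # Fast return before taking any lock: liveness/readiness probes stay
    # responsive even when writers hold the lock and CPU is scarce.
    if uri in NOT_FOR_FORWARD:
        return "not-forwarded"

    # Only requests that might actually be forwarded pay for the lock.
    with inner_lock:
        status = leadership_status
    if status != "SteppedDown":
        return "not-forwarded"
    return "forwarded"


print(maybe_forward("/ready"))  # "not-forwarded", without ever touching inner_lock
```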
## Summary of changes

- Revise logic to return immediately if the URL is in the non-forwarded list, rather than calling maybe_forward
---
 storage_controller/src/http.rs | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 9b5d4caf31..39e078ba7c 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1452,10 +1452,15 @@ async fn maybe_forward(req: Request) -> ForwardOutcome {
     let uri = req.uri().to_string();
     let uri_for_forward = !NOT_FOR_FORWARD.contains(&uri.as_str());
 
+    // Fast return before trying to take any Service locks, if we will never forward anyway
+    if !uri_for_forward {
+        return ForwardOutcome::NotForwarded(req);
+    }
+
     let state = get_state(&req);
     let leadership_status = state.service.get_leadership_status();
 
-    if leadership_status != LeadershipStatus::SteppedDown || !uri_for_forward {
+    if leadership_status != LeadershipStatus::SteppedDown {
         return ForwardOutcome::NotForwarded(req);
     }
 

From 7fc2912d06a79e70091bdd51d422835848048be3 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova
Date: Mon, 2 Dec 2024 10:10:51 +0000
Subject: [PATCH 152/209] Update pgvector to 0.8.0 (#9733)

---
 compute/compute-node.Dockerfile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 9567018053..222a0cb88b 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -358,10 +358,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch
 # because we build the images on different machines than where we run them.
 # Pass OPTFLAGS="" to remove it.
 #
-# vector 0.7.4 supports v17
-# last release v0.7.4 - Aug 5, 2024
-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \
-    echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \
+# vector >0.7.4 supports v17
+# last release v0.8.0 - Oct 30, 2024
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \
+    echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \
     mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \
     patch -p1 < /pgvector.patch && \
     make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \

From 0e1a33660754bb216cd954e61ff90085a64e2d24 Mon Sep 17 00:00:00 2001
From: Erik Grinaker
Date: Mon, 2 Dec 2024 12:26:15 +0200
Subject: [PATCH 153/209] test_runner: improve `wait_until` (#9936)

Improves `wait_until` by:

* Use `timeout` instead of `iterations`. This allows changing the timeout/interval parameters independently.
* Make `timeout` and `interval` optional (default 20s and 0.5s). Most callers don't care.
* Only output status every 1s by default, and add optional `status_interval` parameter.
* Remove `show_intermediate_error`, this was always emitted anyway.

Most callers have been updated to use the defaults, except where they had good reason otherwise.
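With the new callable-first signature (`wait_until(func, name=None, timeout=20.0, interval=0.5, status_interval=1.0)`, introduced in `test_runner/fixtures/utils.py` in the diff below), call sites only override the knobs they care about. A usage sketch follows; the waiting condition is a made-up, time-based stand-in, and it assumes the `test_runner` fixtures are importable:

```
# Usage sketch for the reworked helper. Real callers assert on pageserver or
# tenant state; a trivial time-based condition is used here for illustration.
import time

from fixtures.utils import wait_until

start = time.monotonic()


def one_second_elapsed():
    # Raise until the condition holds; wait_until retries every `interval`.
    assert time.monotonic() - start >= 1.0, "still waiting"


# Defaults: 20 s total timeout, 0.5 s poll interval, progress logged at most
# once per second. Roughly the old wait_until(40, 0.5, one_second_elapsed).
wait_until(one_second_elapsed)

# Slow conditions override only the knob they need instead of recomputing an
# iteration count, e.g. the old wait_until(60, 1, check) becomes:
wait_until(one_second_elapsed, timeout=60, interval=1)
```

The call-site changes in the diffs that follow are mechanical rewrites of this form, which is why most of them simply drop the first two positional arguments.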
--- test_runner/fixtures/neon_fixtures.py | 14 ++-- test_runner/fixtures/pageserver/utils.py | 17 +---- test_runner/fixtures/safekeeper/http.py | 2 +- test_runner/fixtures/safekeeper/utils.py | 2 +- test_runner/fixtures/utils.py | 35 +++++----- test_runner/logical_repl/test_clickhouse.py | 6 +- test_runner/logical_repl/test_debezium.py | 12 +--- .../performance/test_branch_creation.py | 8 +-- .../regress/test_attach_tenant_config.py | 2 - test_runner/regress/test_compaction.py | 2 +- .../regress/test_disk_usage_eviction.py | 10 +-- test_runner/regress/test_hot_standby.py | 6 +- .../regress/test_layers_from_future.py | 2 +- test_runner/regress/test_logging.py | 4 +- .../regress/test_logical_replication.py | 6 +- test_runner/regress/test_lsn_mapping.py | 4 +- test_runner/regress/test_neon_superuser.py | 2 +- test_runner/regress/test_ondemand_download.py | 16 ++--- test_runner/regress/test_pageserver_api.py | 12 +--- .../regress/test_pageserver_generations.py | 14 ++-- .../test_pageserver_getpage_throttle.py | 7 +- .../regress/test_pageserver_layer_rolling.py | 16 ++--- .../regress/test_pageserver_restart.py | 2 +- .../regress/test_pageserver_secondary.py | 6 +- test_runner/regress/test_readonly_node.py | 2 - test_runner/regress/test_remote_storage.py | 42 ++++++----- test_runner/regress/test_replica_start.py | 2 +- test_runner/regress/test_sharding.py | 18 ++--- .../regress/test_storage_controller.py | 70 +++++++++---------- test_runner/regress/test_storage_scrubber.py | 2 - .../regress/test_subscriber_restart.py | 2 +- test_runner/regress/test_tenant_conf.py | 6 +- test_runner/regress/test_tenant_delete.py | 14 ++-- test_runner/regress/test_tenant_detach.py | 8 +-- test_runner/regress/test_tenant_relocation.py | 6 +- test_runner/regress/test_tenant_size.py | 4 +- test_runner/regress/test_tenant_tasks.py | 2 +- test_runner/regress/test_tenants.py | 2 +- .../test_tenants_with_remote_storage.py | 12 +--- test_runner/regress/test_timeline_archive.py | 6 +- test_runner/regress/test_timeline_delete.py | 61 +++++----------- .../regress/test_timeline_detach_ancestor.py | 30 ++++---- .../regress/test_timeline_gc_blocking.py | 2 +- test_runner/regress/test_timeline_size.py | 36 +++++----- test_runner/regress/test_wal_acceptor.py | 22 +++--- test_runner/regress/test_wal_receiver.py | 4 +- 46 files changed, 234 insertions(+), 326 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9bcfffeb9c..5709a3b82b 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1736,7 +1736,7 @@ class NeonStorageController(MetricsGetter, LogUtils): def storage_controller_ready(): assert self.ready() is True - wait_until(30, 1, storage_controller_ready) + wait_until(storage_controller_ready) return time.time() - t1 def attach_hook_issue( @@ -2574,7 +2574,7 @@ class NeonPageserver(PgProtocol, LogUtils): log.info(f"any_unstable={any_unstable}") assert not any_unstable - wait_until(20, 0.5, complete) + wait_until(complete) def __enter__(self) -> Self: return self @@ -3973,7 +3973,7 @@ class Endpoint(PgProtocol, LogUtils): migration_id: int = cur.fetchall()[0][0] assert migration_id >= num_migrations - wait_until(20, 0.5, check_migrations_done) + wait_until(check_migrations_done) # Mock the extension part of spec passed from control plane for local testing # endpooint.rs adds content of this file as a part of the spec.json @@ -4489,12 +4489,10 @@ class Safekeeper(LogUtils): ) assert stat.remote_consistent_lsn >= lsn and 
stat.backup_lsn >= lsn.segment_lsn() - # xxx: max wait is long because we might be waiting for reconnection from - # pageserver to this safekeeper - wait_until(30, 1, are_lsns_advanced) + wait_until(are_lsns_advanced) client.checkpoint(tenant_id, timeline_id) if wait_wal_removal: - wait_until(30, 1, are_segments_removed) + wait_until(are_segments_removed) def wait_until_paused(self, failpoint: str): msg = f"at failpoint {failpoint}" @@ -4503,7 +4501,7 @@ class Safekeeper(LogUtils): log.info(f"waiting for hitting failpoint {failpoint}") self.assert_log_contains(msg) - wait_until(20, 0.5, paused) + wait_until(paused) class NeonBroker(LogUtils): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 46700e3fe3..7c10edc5fc 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -13,7 +13,7 @@ from mypy_boto3_s3.type_defs import ( from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage +from fixtures.remote_storage import RemoteStorage, S3Storage from fixtures.utils import wait_until if TYPE_CHECKING: @@ -269,12 +269,7 @@ def wait_timeline_detail_404( pageserver_http: PageserverHttpClient, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - iterations: int, - interval: float | None = None, ): - if interval is None: - interval = 0.25 - def timeline_is_missing(): data = {} try: @@ -287,19 +282,17 @@ def wait_timeline_detail_404( raise RuntimeError(f"Timeline exists state {data.get('state')}") - wait_until(iterations, interval, func=timeline_is_missing) + wait_until(timeline_is_missing) def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - iterations: int = 20, - interval: float | None = None, **delete_args, ) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) - wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval) + wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id) # remote_storage must not be None, but that's easier for callers to make mypy happy @@ -453,7 +446,3 @@ def many_small_layers_tenant_config() -> dict[str, Any]: "checkpoint_distance": 1024**2, "image_creation_threshold": 100, } - - -def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: - return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 15 diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 094188c0b5..286f80ba69 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -175,7 +175,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert s > Lsn(0) return s - return wait_until(30, 1, timeline_start_lsn_non_zero) + return wait_until(timeline_start_lsn_non_zero) def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py index 0246916470..922cdedccc 100644 --- a/test_runner/fixtures/safekeeper/utils.py +++ b/test_runner/fixtures/safekeeper/utils.py @@ -19,4 +19,4 @@ def wait_walreceivers_absent( 
log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") assert len(status.walreceivers) == 0 - wait_until(30, 0.5, walreceivers_absent) + wait_until(walreceivers_absent) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 04e98fe494..c34ac298d1 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -9,6 +9,7 @@ import tarfile import threading import time from collections.abc import Callable, Iterable +from datetime import datetime, timedelta from hashlib import sha256 from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar @@ -380,15 +381,10 @@ def start_in_background( if return_code is not None: error = f"expected subprocess to run but it exited with code {return_code}" else: - attempts = 10 try: - wait_until( - number_of_iterations=attempts, - interval=1, - func=is_started, - ) + wait_until(is_started, timeout=10) except Exception: - error = f"Failed to get correct status from subprocess in {attempts} attempts" + error = "Failed to get correct status from subprocess" except Exception as e: error = f"expected subprocess to start but it failed with exception: {e}" @@ -402,28 +398,31 @@ def start_in_background( def wait_until( - number_of_iterations: int, - interval: float, func: Callable[[], WaitUntilRet], - show_intermediate_error: bool = False, + name: str | None = None, + timeout: float = 20.0, # seconds + interval: float = 0.5, # seconds + status_interval: float = 1.0, # seconds ) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the last return value from the function. """ + if name is None: + name = getattr(func, "__name__", repr(func)) + deadline = datetime.now() + timedelta(seconds=timeout) + next_status = datetime.now() last_exception = None - for i in range(number_of_iterations): + while datetime.now() <= deadline: try: - res = func() + return func() except Exception as e: - log.info("waiting for %s iteration %s failed: %s", func, i + 1, e) + if datetime.now() >= next_status: + log.info("waiting for %s: %s", name, e) + next_status = datetime.now() + timedelta(seconds=status_interval) last_exception = e - if show_intermediate_error: - log.info(e) time.sleep(interval) - continue - return res - raise Exception(f"timed out while waiting for {func}") from last_exception + raise Exception(f"timed out while waiting for {name}") from last_exception def assert_eq(a, b) -> None: diff --git a/test_runner/logical_repl/test_clickhouse.py b/test_runner/logical_repl/test_clickhouse.py index 8e03bbe5d4..6b522fa46d 100644 --- a/test_runner/logical_repl/test_clickhouse.py +++ b/test_runner/logical_repl/test_clickhouse.py @@ -60,24 +60,22 @@ def test_clickhouse(remote_pg: RemotePostgres): "SETTINGS materialized_postgresql_tables_list = 'table1';" ) wait_until( - 120, - 0.5, lambda: query_clickhouse( client, "select * from db1_postgres.table1 order by 1", "ee600d8f7cd05bd0b169fa81f44300a9dd10085a", ), + timeout=60, ) cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');") conn.commit() wait_until( - 120, - 0.5, lambda: query_clickhouse( client, "select * from db1_postgres.table1 order by 1", "9eba2daaf7e4d7d27ac849525f68b562ab53947d", ), + timeout=60, ) log.debug("Sleeping before final checking if Neon is still alive") time.sleep(3) diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py index d2cb087c92..8023d64d3d 100644 --- a/test_runner/logical_repl/test_debezium.py +++ 
b/test_runner/logical_repl/test_debezium.py @@ -148,14 +148,12 @@ def test_debezium(debezium): ) conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, ), - show_intermediate_error=True, + timeout=60, ) ts_ms = time.time() * 1000 log.info("Insert 2 ts_ms: %s", ts_ms) @@ -165,28 +163,24 @@ def test_debezium(debezium): ) conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, ), - show_intermediate_error=True, + timeout=60, ) ts_ms = time.time() * 1000 log.info("Update ts_ms: %s", ts_ms) cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "Alexander"}, ), - show_intermediate_error=True, + timeout=60, ) time.sleep(3) cur.execute("select 1") diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index c50c4ad432..3ce27d6cd3 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -137,7 +137,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: startup_line = "INFO version: git(-env)?:" # find the first line of the log file so we can find the next start later - _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) + _, first_start = wait_until(lambda: env.pageserver.assert_log_contains(startup_line)) # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier @@ -156,7 +156,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: ) _, second_start = wait_until( - 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) + lambda: env.pageserver.assert_log_contains(startup_line, first_start), ) env.pageserver.quiesce_tenants() @@ -164,8 +164,6 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # wait for compaction to complete, which most likely has already done so multiple times msg, _ = wait_until( - 30, - 1, lambda: env.pageserver.assert_log_contains( f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start ), @@ -205,7 +203,7 @@ def wait_and_record_startup_metrics( assert len(matching) == len(expected_labels) return matching - samples = wait_until(10, 1, metrics_are_filled) + samples = wait_until(metrics_are_filled) for sample in samples: phase = sample.labels["phase"] diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 670c2698f5..45112fd67e 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -64,8 +64,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N ) wait_until( - 50, - 0.1, lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"), ) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 302a8fd0d1..b6741aed68 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -385,7 +385,7 @@ def test_pageserver_compaction_circuit_breaker(neon_env_builder: NeonEnvBuilder) # Wait for enough failures to break the circuit 
breaker # This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s - wait_until(60, 1, assert_broken) + wait_until(assert_broken, timeout=60) # Sleep for a while, during which time we expect that compaction will _not_ be retried time.sleep(10) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 1807511008..05956b5b93 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -211,7 +211,7 @@ class EvictionEnv: pageserver.assert_log_contains(".*running mocked statvfs.*") # we most likely have already completed multiple runs - wait_until(10, 1, statvfs_called) + wait_until(statvfs_called) def count_layers_per_tenant( @@ -772,14 +772,14 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): ) wait_until( - 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") ) def less_than_max_usage_pct(): post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage" - wait_until(2, 2, less_than_max_usage_pct) + wait_until(less_than_max_usage_pct, timeout=5) # Disk usage candidate collection only takes into account active tenants. # However, the statvfs call takes into account the entire tenants directory, @@ -825,7 +825,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): ) wait_until( - 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved"), ) def more_than_min_avail_bytes_freed(): @@ -834,7 +834,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): total_size - post_eviction_total_size >= min_avail_bytes ), f"we requested at least {min_avail_bytes} worth of free space" - wait_until(2, 2, more_than_min_avail_bytes_freed) + wait_until(more_than_min_avail_bytes_freed, timeout=5) def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 0b1ac11c16..4044f25b37 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -257,7 +257,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # Wait until we see that the pgbench_accounts is created + filled on replica *and* # index is created. Otherwise index creation would conflict with # read queries and hs feedback won't save us. - wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) + wait_until(partial(pgbench_accounts_initialized, secondary), timeout=60) # Test should fail if hs feedback is disabled anyway, but cross # check that walproposer sets some xmin. 
@@ -269,7 +269,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"xmin is {slot_xmin}") assert int(slot_xmin) > 0 - wait_until(10, 1.0, xmin_is_not_null) + wait_until(xmin_is_not_null) for _ in range(1, 5): # in debug mode takes about 5-7s balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts") @@ -286,7 +286,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): log.info(f"xmin is {slot_xmin}") assert slot_xmin is None - wait_until(10, 1.0, xmin_is_null) + wait_until(xmin_is_null) # Test race condition between WAL replay and backends performing queries diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 761ec7568f..8818b40712 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -206,7 +206,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder, attach_mode: str): future_layers = set(get_future_layers()) assert future_layer not in future_layers - wait_until(10, 0.5, future_layer_is_gone_from_index_part) + wait_until(future_layer_is_gone_from_index_part) # We already make deletion stuck here, but we don't necessarily hit the failpoint # because deletions are batched. diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index f6fbdcabfd..d94c786f49 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -37,7 +37,7 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): return env.pageserver.assert_log_contains(f".*{msg_id}.*") - wait_until(10, 0.5, assert_logged) + wait_until(assert_logged) # make sure it's counted def assert_metric_value(): @@ -49,4 +49,4 @@ def test_logging_event_count(neon_env_builder: NeonEnvBuilder, level: str): log.info("libmetrics_tracing_event_count: %s", val) assert val > (before or 0.0) - wait_until(10, 1, assert_metric_value) + wait_until(assert_metric_value) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index ba471b7147..db18e1758c 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -207,7 +207,7 @@ def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgre log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) + wait_until(partial(slot_removed, endpoint)) def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder): @@ -519,7 +519,7 @@ def test_replication_shutdown(neon_simple_env: NeonEnv): assert len(res) == 4 assert [r[0] for r in res] == [10, 20, 30, 40] - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: @@ -549,7 +549,7 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication ) assert flush_lsn >= publisher_flush_lsn - wait_until(30, 0.5, check_caughtup) + wait_until(check_caughtup) return publisher_flush_lsn diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 7f0b541128..e42e71646d 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ 
b/test_runner/regress/test_lsn_mapping.py @@ -169,7 +169,7 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) with pytest.raises(ReadTimeout): @@ -178,8 +178,6 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): client.configure_failpoints((failpoint, "off")) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains( "Cancelled request finished with an error: Cancelled$", offset ), diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 7118127a1f..49cd91906f 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -77,7 +77,7 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): assert len(res) == 4 assert [r[0] for r in res] == [10, 20, 30, 40] - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) # Test that pg_monitor is working for neon_superuser role cur.execute("SELECT query from pg_stat_activity LIMIT 1") diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index e1caaeb6c1..028d1c2e49 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -256,7 +256,7 @@ def test_ondemand_download_timetravel(neon_env_builder: NeonEnvBuilder): ##### Second start, restore the data and ensure it's the same env.pageserver.start() - wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) # The current_physical_size reports the sum of layers loaded in the layer # map, regardless of where the layer files are located. 
So even though we @@ -413,7 +413,7 @@ def test_download_remote_layers_api( ] ) - wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) ###### Phase 1: exercise download error code path @@ -705,7 +705,7 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) location_conf = {"mode": "Detached", "tenant_conf": {}} @@ -713,8 +713,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains( "closing is taking longer than expected", offset ), @@ -734,8 +732,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu client.configure_failpoints((failpoint, "pause")) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), ) @@ -750,8 +746,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset), ) @@ -805,7 +799,7 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) # ensure enough time while paused to trip the timeout time.sleep(2) @@ -824,8 +818,6 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): # capture the next offset for a new synchronization with the failpoint _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), ) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 05e81b82e0..55fd7a8608 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -117,19 +117,11 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): # We need to wait here because it's possible that we don't have access to # the latest WAL yet, when the `timeline_detail` API is first called. # See: https://github.com/neondatabase/neon/issues/1768. - lsn = wait_until( - number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None), - ) + lsn = wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None)) # Make a DB modification then expect getting a new WAL receiver's data. 
endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')") - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn), - ) + wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn)) def test_pageserver_http_api_client(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 6ba5753420..7e5bb45242 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -352,7 +352,7 @@ def test_deletion_queue_recovery( def assert_some_validations(): assert get_deletion_queue_validated(ps_http) > 0 - wait_until(20, 1, assert_some_validations) + wait_until(assert_some_validations) # The validatated keys statistic advances before the header is written, so we # also wait to see the header hit the disk: this seems paranoid but the race @@ -360,7 +360,7 @@ def test_deletion_queue_recovery( def assert_header_written(): assert (main_pageserver.workdir / "deletion" / "header-01").exists() - wait_until(20, 1, assert_header_written) + wait_until(assert_header_written) # If we will lose attachment, then our expectation on restart is that only the ones # we already validated will execute. Act like only those were present in the queue. @@ -382,11 +382,11 @@ def test_deletion_queue_recovery( # After restart, issue a flush to kick the deletion frontend to do recovery. # It should recover all the operations we submitted before the restart. ps_http.deletion_queue_flush(execute=False) - wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth)) + wait_until(lambda: assert_deletions_submitted(before_restart_depth)) # The queue should drain through completely if we flush it ps_http.deletion_queue_flush(execute=True) - wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) + wait_until(lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) if keep_attachment == KeepAttachment.KEEP: # - If we kept the attachment, then our pre-restart deletions should execute @@ -564,7 +564,7 @@ def test_multi_attach( ) # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) with pytest.raises(PageserverApiException): http_clients[1].timeline_detail(tenant_id, timeline_id) @@ -579,8 +579,8 @@ def test_multi_attach( pageservers[1].tenant_attach(env.initial_tenant) pageservers[2].tenant_attach(env.initial_tenant) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) # Now they all have it attached _details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients]) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index f1aad85fe9..ba6a1d9045 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -81,9 +81,7 @@ def 
test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P marker = uuid.uuid4().hex ps_http.post_tracing_event("info", marker) - _, marker_offset = wait_until( - 10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None) - ) + _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None)) log.info("run pagebench") duration_secs = 10 @@ -103,12 +101,11 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P log.info("validate that we logged the throttling") wait_until( - 10, - compaction_period / 10, lambda: env.pageserver.assert_log_contains( f".*{tenant_id}.*shard was throttled in the last n_seconds.*", offset=marker_offset, ), + timeout=compaction_period, ) log.info("validate that the metric doesn't include throttle wait time") diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index f6a7bfa1ad..706da1e35e 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -84,7 +84,7 @@ def wait_for_wal_ingest_metric(pageserver_http: PageserverHttpClient) -> float: # The metric gets initialised on the first update. # Retry a few times, but return 0 if it's stable. try: - return float(wait_until(3, 0.5, query)) + return float(wait_until(query, timeout=2, interval=0.5)) except Exception: return 0 @@ -131,7 +131,7 @@ def test_pageserver_small_inmemory_layers( wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + wait_until(lambda: assert_dirty_bytes_nonzero(env)) ps_http_client = env.pageserver.http_client() total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) @@ -139,7 +139,7 @@ def test_pageserver_small_inmemory_layers( # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) + wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS) # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they # must be uploaded to remain visible to the pageserver after restart. @@ -180,7 +180,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + wait_until(lambda: assert_dirty_bytes_nonzero(env)) # Stop the safekeepers, so that we cannot have any more WAL receiver connections for sk in env.safekeepers: @@ -193,7 +193,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) + wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS) # The code below verifies that we do not flush on the first write # after an idle period longer than the checkpoint timeout. 
@@ -210,7 +210,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) ) - dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + dirty_after_write = wait_until(lambda: assert_dirty_bytes_nonzero(env)) # We shouldn't flush since we've just opened a new layer waited_for = 0 @@ -305,11 +305,11 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder): # Wait until enough layers have rolled that the amount of dirty data is under the threshold. # We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing # if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit. - wait_until(compaction_period_s * 2, 1, assert_bytes_rolled) + wait_until(assert_bytes_rolled, timeout=2 * compaction_period_s) # The end state should also have the reported metric under the limit def assert_dirty_data_limited(): dirty_bytes = get_dirty_bytes(env) assert dirty_bytes < max_dirty_data - wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) + wait_until(lambda: assert_dirty_data_limited(), timeout=2 * compaction_period_s) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 4bf5705517..835ccbd5d4 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -103,7 +103,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): raise AssertionError("No 'complete' metric yet") - wait_until(30, 1.0, assert_complete) + wait_until(assert_complete) # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value expectations = [ diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index a264f4d3c9..1292682f9e 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -356,7 +356,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): ) assert destination_lsn >= origin_lsn - wait_until(100, 0.1, caught_up) + wait_until(caught_up) # The destination should accept writes workload.churn_rows(64, pageserver_b.id) @@ -411,7 +411,7 @@ def test_live_migration(neon_env_builder: NeonEnvBuilder): assert submitted is not None assert submitted > 0 - wait_until(10, 0.1, blocked_deletions_drained) + wait_until(blocked_deletions_drained) workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) @@ -702,7 +702,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): else: timeout = int(deadline - now) + 1 try: - wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) + wait_until(lambda: pageserver.assert_log_contains(expression), timeout=timeout) except: log.error(f"Timed out waiting for '{expression}'") raise diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 70d558ac5a..c13bea7ee1 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -215,8 +215,6 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # wait for lease renewal before running query. 
_, offset = wait_until( - 20, - 0.5, lambda: ep_static.assert_log_contains( "lsn_lease_bg_task.*Request succeeded", offset=offset ), diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 137e75f784..76a42ef4a2 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -300,9 +300,9 @@ def test_remote_storage_upload_queue_retries( print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # let all future operations queue up configure_storage_sync_failpoints("return") @@ -333,16 +333,28 @@ def test_remote_storage_upload_queue_retries( # wait for churn thread's data to get stuck in the upload queue # Exponential back-off in upload queue, so, gracious timeouts. - wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until( + lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 + ) # unblock churn operations configure_storage_sync_failpoints("off") - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 + ) # The churn thread doesn't make progress once it blocks on the first wait_completion() call, # so, give it some time to wrap up. @@ -580,7 +592,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( > 0 ) - wait_until(200, 0.1, assert_compacted_and_uploads_queued) + wait_until(assert_compacted_and_uploads_queued) # Regardless, give checkpoint some time to block for good. # Not strictly necessary, but might help uncover failure modes in the future. @@ -598,9 +610,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue( ] ) - # Generous timeout, because currently deletions can get blocked waiting for compaction - # This can be reduced when https://github.com/neondatabase/neon/issues/4998 is fixed. 
- timeline_delete_wait_completed(client, tenant_id, timeline_id, iterations=30, interval=1) + timeline_delete_wait_completed(client, tenant_id, timeline_id) assert not timeline_path.exists() @@ -826,22 +836,16 @@ def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0 ), ) wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0 ), ) wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0 ), diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index 8e7c01f950..e2a22cc769 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -378,7 +378,7 @@ def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv): return None raise RuntimeError("connection succeeded") - wait_until(20, 0.5, check_replica_crashed) + wait_until(check_replica_crashed) assert secondary.log_contains("too many KnownAssignedXids") # Replica is crashed, so ignore stop result diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 411574bd86..c86ba0d4ea 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -836,7 +836,7 @@ def test_sharding_split_stripe_size( assert len(notifications) == 3 assert notifications[2] == expect_after - wait_until(10, 1, assert_restart_notification) + wait_until(assert_restart_notification) # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're @@ -1025,7 +1025,7 @@ def test_sharding_ingest_gaps( assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn # We set a short checkpoint timeout: expect things to get frozen+flushed within that - wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent) + wait_until(assert_all_disk_consistent, timeout=3 * checkpoint_interval_secs) def assert_all_remote_consistent(): """ @@ -1037,7 +1037,7 @@ def test_sharding_ingest_gaps( assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn # We set a short checkpoint timeout: expect things to get frozen+flushed within that - wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent) + wait_until(assert_all_remote_consistent, timeout=3 * checkpoint_interval_secs) workload.validate() @@ -1405,14 +1405,14 @@ def test_sharding_split_failures( # e.g. 
while waiting for a storage controller to re-attach a parent shard if we failed # inside the pageserver and the storage controller responds by detaching children and attaching # parents concurrently (https://github.com/neondatabase/neon/issues/7148) - wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) + wait_until(lambda: workload.churn_rows(10, upload=False, ingest=False)) workload.validate() if failure.fails_forward(env): log.info("Fail-forward failure, checking split eventually completes...") # A failure type which results in eventual completion of the split - wait_until(30, 1, assert_split_done) + wait_until(assert_split_done) elif failure.can_mitigate(): log.info("Mitigating failure...") # Mitigation phase: we expect to be able to proceed with a successful shard split @@ -1420,21 +1420,21 @@ def test_sharding_split_failures( # The split should appear to be rolled back from the point of view of all pageservers # apart from the one that is offline - wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) + wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) finish_split() - wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) + wait_until(lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) # Having cleared the failure, everything should converge to a pristine state failure.clear(env) - wait_until(30, 1, assert_split_done) + wait_until(assert_split_done) else: # Once we restore the faulty pageserver's API to good health, rollback should # eventually complete. log.info("Clearing failure...") failure.clear(env) - wait_until(30, 1, assert_rolled_back) + wait_until(assert_rolled_back) # Having rolled back, the tenant should be working workload.churn_rows(10) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 13bc54a114..e93e251b4f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -154,7 +154,7 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) counts = get_node_shard_counts(env, tenant_ids) assert counts[node_id] == 0 - wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + wait_until(lambda: node_evacuated(env.pageservers[0].id)) # Let all the reconciliations after marking the node offline complete env.storage_controller.reconcile_until_idle() @@ -222,7 +222,7 @@ def test_node_status_after_restart( def is_ready(): assert env.storage_controller.ready() is True - wait_until(30, 1, is_ready) + wait_until(is_ready) # We loaded nodes from database on restart nodes = env.storage_controller.node_list() @@ -606,7 +606,7 @@ def test_storage_controller_compute_hook( counts = get_node_shard_counts(env, [env.initial_tenant]) assert counts[node_id] == 0 - wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + wait_until(lambda: node_evacuated(env.pageservers[0].id)) # Additional notification from migration log.info(f"notifications: {notifications}") @@ -620,7 +620,7 @@ def test_storage_controller_compute_hook( assert len(notifications) == 2 assert notifications[1] == expect - wait_until(20, 0.25, received_migration_notification) + wait_until(received_migration_notification) # When we restart, we should re-emit notifications for all tenants env.storage_controller.stop() @@ -630,7 +630,7 @@ def test_storage_controller_compute_hook( assert len(notifications) == 3 assert notifications[2] == expect - 
wait_until(10, 1, received_restart_notification) + wait_until(received_restart_notification) # Splitting a tenant should cause its stripe size to become visible in the compute notification env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) @@ -647,7 +647,7 @@ def test_storage_controller_compute_hook( assert len(notifications) == 4 assert notifications[3] == expect - wait_until(10, 1, received_split_notification) + wait_until(received_split_notification) # If the compute hook is unavailable, that should not block creating a tenant and # creating a timeline. This simulates a control plane refusing to accept notifications @@ -736,7 +736,7 @@ def test_storage_controller_stuck_compute_hook( def logged_stuck(): env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG) - wait_until(10, 0.25, logged_stuck) + wait_until(logged_stuck) contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG) assert contains_r is not None # Appease mypy (_, log_cursor) = contains_r @@ -764,7 +764,7 @@ def test_storage_controller_stuck_compute_hook( def logged_stuck_again(): env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor) - wait_until(10, 0.25, logged_stuck_again) + wait_until(logged_stuck_again) assert migrate_fut.running() # This time, the compute hook remains stuck, but we mark the origin node offline: this should @@ -865,7 +865,7 @@ def test_storage_controller_compute_hook_revert( assert latest["shards"] is not None assert latest["shards"][0]["node_id"] == ps_id - wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + wait_until(lambda: notified_ps(pageserver_a.id)) env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) @@ -880,7 +880,7 @@ def test_storage_controller_compute_hook_revert( # Although the migration API failed, the hook should still see pageserver B (it remembers what # was posted even when returning an error code) - wait_until(30, 1, lambda: notified_ps(pageserver_b.id)) + wait_until(lambda: notified_ps(pageserver_b.id)) # Although the migration API failed, the tenant should still have moved to the right pageserver assert len(pageserver_b.http_client().tenant_list()) == 1 @@ -898,7 +898,7 @@ def test_storage_controller_compute_hook_revert( def logged_giving_up(): env.storage_controller.assert_log_contains(".*Giving up on compute notification.*") - wait_until(30, 1, logged_giving_up) + wait_until(logged_giving_up) pageserver_a.start() @@ -919,7 +919,7 @@ def test_storage_controller_compute_hook_revert( handle_params["status"] = 200 env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) - wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + wait_until(lambda: notified_ps(pageserver_a.id)) def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): @@ -1453,7 +1453,7 @@ def test_storage_controller_heartbeats( # Check that each node got one tenant assert all(len(ts) == 1 for ts in node_to_tenants.values()) - wait_until(10, 1, tenants_placed) + wait_until(tenants_placed) # ... 
then we apply the failure offline_node_ids = set(failure.nodes()) @@ -1476,7 +1476,7 @@ def test_storage_controller_heartbeats( assert node["availability"] == "Offline" start = time.time() - wait_until(failure.offline_timeout, 1, nodes_offline) + wait_until(nodes_offline, timeout=failure.offline_timeout) detected_after = time.time() - start log.info(f"Detected node failures after {detected_after}s") @@ -1497,7 +1497,7 @@ def test_storage_controller_heartbeats( assert observed_tenants == set(tenant_ids) - wait_until(10, 1, tenant_migrated) + wait_until(tenant_migrated) # ... then we clear the failure failure.clear(env) @@ -1509,7 +1509,7 @@ def test_storage_controller_heartbeats( if node["id"] in online_node_ids: assert node["availability"] == "Active" - wait_until(10, 1, nodes_online) + wait_until(nodes_online) time.sleep(5) @@ -1562,7 +1562,7 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): # We could pre-empty this by configuring the node to Offline, but it's preferable to test # the realistic path we would take when a node restarts uncleanly. # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local - wait_until(30, 1, failed_over) + wait_until(failed_over) reconciles_before_restart = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} @@ -1640,12 +1640,12 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui assert e > n return e - errs = wait_until(10, 1, lambda: assert_errors_gt(0)) + errs = wait_until(lambda: assert_errors_gt(0)) # Try reconciling again, it should fail again with pytest.raises(StorageControllerApiException): env.storage_controller.reconcile_all() - errs = wait_until(10, 1, lambda: assert_errors_gt(errs)) + errs = wait_until(lambda: assert_errors_gt(errs)) # Configure the tenant to disable reconciles env.storage_controller.tenant_policy_update( @@ -1674,7 +1674,7 @@ def test_storage_controller_shard_scheduling_policy(neon_env_builder: NeonEnvBui return o # We should see a successful reconciliation - wait_until(10, 1, lambda: assert_ok_gt(0)) + wait_until(lambda: assert_ok_gt(0)) # And indeed the tenant should be attached assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 @@ -2073,7 +2073,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P raise Exception(f"Secondary lag not big enough: {lag}") log.info(f"Looking for lag to develop on the secondary {secondary}") - wait_until(10, 1, secondary_is_lagging) + wait_until(secondary_is_lagging) log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}") env.storage_controller.retryable_node_operation( @@ -2107,7 +2107,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P if lag > 1 * 1024 * 1024: raise Exception(f"Secondary lag not big enough: {lag}") - wait_until(10, 1, lag_is_acceptable) + wait_until(lag_is_acceptable) env.storage_controller.node_configure(primary, {"scheduling": "Active"}) @@ -2227,7 +2227,7 @@ def test_storage_controller_node_deletion( log.info(f"Shards on nodes other than on victim: {elsewhere}") assert elsewhere == tenant_count * shard_count_per_tenant - wait_until(30, 1, assert_shards_migrated) + wait_until(assert_shards_migrated) log.info(f"Deleting pageserver {victim.id}") env.storage_controller.node_delete(victim.id) @@ -2240,7 +2240,7 @@ def test_storage_controller_node_deletion( log.info(f"Shards on node {victim.id}: {count}") assert count == 
0 - wait_until(30, 1, assert_victim_evacuated) + wait_until(assert_victim_evacuated) # The node should be gone from the list API assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] @@ -2569,7 +2569,7 @@ def test_storage_controller_leadership_transfer( == StorageControllerLeadershipStatus.STEPPED_DOWN ) - wait_until(5, 1, previous_stepped_down) + wait_until(previous_stepped_down) storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") @@ -2579,7 +2579,7 @@ def test_storage_controller_leadership_transfer( == StorageControllerLeadershipStatus.LEADER ) - wait_until(15, 1, new_becomes_leader) + wait_until(new_becomes_leader) leader = env.storage_controller.get_leader() assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/" @@ -2624,7 +2624,7 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB env.storage_controller.configure_failpoints(("sleepy-drain-loop", "return(10000)")) env.storage_controller.node_drain(attached.id) - wait_until(10, 0.5, attached_is_draining) + wait_until(attached_is_draining) attached.restart() @@ -2646,7 +2646,7 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) # allow for small delay between actually having cancelled and being able reconfigure again - wait_until(4, 0.5, reconfigure_node_again) + wait_until(reconfigure_node_again) def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder): @@ -2691,7 +2691,7 @@ def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder) ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers ) - wait_until(10, 1, has_hit_failpoint) + wait_until(has_hit_failpoint) # Migrate the tenant while the timeline creation is in progress: this migration will complete once it # can detach from the old pageserver, which will happen once the failpoint completes. @@ -2775,7 +2775,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB def has_hit_compaction_failpoint(): assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}") - wait_until(10, 1, has_hit_compaction_failpoint) + wait_until(has_hit_compaction_failpoint) # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep, # after incrementing generation and attaching the new location @@ -2794,7 +2794,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB # before it reaches this point. The timeout is because the AttachedStale transition includes # a flush of remote storage, and if the compaction already enqueued an index upload this cannot # make progress. - wait_until(60, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint, timeout=60) # Origin pageserver has succeeded with compaction before the migration completed. 
It has done all the writes it wanted to do in its own (stale) generation origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) @@ -2917,7 +2917,7 @@ def test_storage_controller_proxy_during_migration( log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) # This request should be routed to whichever pageserver holds the highest generation tenant_info = env.storage_controller.pageserver_api().tenant_status( @@ -2934,7 +2934,7 @@ def test_storage_controller_proxy_during_migration( # We expect request to land on the origin assert tenant_info["generation"] == 1 - wait_until(10, 1, long_migration_metric_published) + wait_until(long_migration_metric_published) # Eventually migration completes env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) @@ -3113,7 +3113,7 @@ def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, mi log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) env.storage_controller.pageserver_api().timeline_delete( tenant_id=tenant_id, timeline_id=timeline_id @@ -3182,7 +3182,7 @@ def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migr log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) timeline_id = TimelineId.generate() env.storage_controller.pageserver_api().timeline_create( diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 3991bd7061..b16dc54c24 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -431,8 +431,6 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder): # Let the controller reach the failpoint wait_until( - 10, - 1, lambda: env.storage_controller.assert_log_contains( 'failpoint "shard-split-post-remote-sleep": sleeping' ), diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index d37eeb1e6e..7d4f66d044 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -56,4 +56,4 @@ def test_subscriber_restart(neon_simple_env: NeonEnv): pcur.execute(f"INSERT into t values ({n_records}, 0)") n_records += 1 with sub.cursor() as scur: - wait_until(60, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 1dd46ec3d1..f8f240cfdc 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -234,11 +234,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" env.pageserver.tenant_attach(tenant_id) - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(http_client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(http_client, tenant_id, "Active")) env.config_tenant(tenant_id, {"gc_horizon": "1000000"}) contents_first = config_path.read_text() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 47df3ead70..48e55c1ab1 100644 --- 
a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -185,21 +185,21 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE deletion = None try: - wait_until(10, 1, has_hit_failpoint) + wait_until(has_hit_failpoint) # it should start ok, sync up with the stuck creation, then hang waiting for the timeline # to shut down. deletion = Thread(target=start_deletion) deletion.start() - wait_until(10, 1, deletion_has_started_waiting_for_timelines) + wait_until(deletion_has_started_waiting_for_timelines) pageserver_http.configure_failpoints((failpoint, "off")) creation.join() deletion.join() - wait_until(10, 1, tenant_is_deleted) + wait_until(tenant_is_deleted) finally: creation.join() if deletion is not None: @@ -264,7 +264,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) def hit_initdb_upload_failpoint(): env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - wait_until(100, 0.1, hit_initdb_upload_failpoint) + wait_until(hit_initdb_upload_failpoint) def creation_connection_timed_out(): env.pageserver.assert_log_contains( @@ -273,7 +273,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) # Wait so that we hit the timeout and the connection is dropped # (But timeline creation still continues) - wait_until(100, 0.1, creation_connection_timed_out) + wait_until(creation_connection_timed_out) ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) @@ -281,7 +281,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) def tenant_delete_inner(): ps_http.tenant_delete(tenant_id) - wait_until(100, 0.5, tenant_delete_inner) + wait_until(tenant_delete_inner) Thread(target=tenant_delete).start() @@ -290,7 +290,7 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" ) - wait_until(100, 0.1, deletion_arrived) + wait_until(deletion_arrived) ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 8d7ca7bc4e..3f21dc895a 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -212,7 +212,7 @@ def test_tenant_reattach_while_busy( nonlocal updates_started, updates_finished, updates_to_perform # Wait until we have performed some updates - wait_until(20, 0.5, lambda: updates_finished > 500) + wait_until(lambda: updates_finished > 500) log.info("Detaching tenant") pageserver_http.tenant_detach(tenant_id) @@ -512,7 +512,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( ) assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 - wait_until(10, 0.5, found_broken) + wait_until(found_broken) client.tenant_detach(env.initial_tenant) @@ -524,7 +524,7 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( ) assert only_int(broken) == 0 and len(broken_set) == 0 - wait_until(10, 0.5, found_cleaned_up) + wait_until(found_cleaned_up) env.pageserver.tenant_attach(env.initial_tenant) @@ -536,4 +536,4 @@ def test_metrics_while_ignoring_broken_tenant_and_reloading( ) assert only_int(active) == 1 and len(broken_set) == 0 - wait_until(10, 0.5, found_active) + wait_until(found_active) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 
bf6120aa0a..df53a98e92 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -298,11 +298,7 @@ def test_tenant_relocation( destination_ps.tenant_attach(tenant_id) # wait for tenant to finish attaching - wait_until( - number_of_iterations=10, - interval=1, - func=lambda: assert_tenant_state(destination_http, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(destination_http, tenant_id, "Active")) check_timeline_attached( destination_http, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 8b733da0c6..713f89c60f 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -638,7 +638,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): with ThreadPoolExecutor(max_workers=1) as exec: completion = exec.submit(client.tenant_size, env.initial_tenant) _, last_offset = wait_until( - 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) timeline_delete_wait_completed(client, env.initial_tenant, branch_id) @@ -656,8 +656,6 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): with ThreadPoolExecutor(max_workers=1) as exec: completion = exec.submit(client.tenant_size, env.initial_tenant) wait_until( - 10, - 1.0, lambda: env.pageserver.assert_log_contains( f"at failpoint {failpoint}", offset=last_offset ), diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 72183f5778..4c26b64d22 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -77,4 +77,4 @@ def test_tenant_tasks(neon_env_builder: NeonEnvBuilder): assert tasks_started == tasks_ended assert tasks_panicked is None or int(tasks_panicked) == 0 - wait_until(10, 0.2, assert_tasks_finish) + wait_until(assert_tasks_finish) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 158c3fddb0..d31901b384 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -330,7 +330,7 @@ def test_pageserver_with_empty_tenants(neon_env_builder: NeonEnvBuilder): assert len(tenants) == 1 assert all(t["state"]["slug"] != "Attaching" for t in tenants) - wait_until(10, 0.2, not_attaching) + wait_until(not_attaching) tenants = client.tenant_list() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 8d3ddf7e54..6b27c41d1c 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -178,11 +178,7 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): env.pageserver.start() client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) restored_timelines = client.timeline_list(tenant_id) assert ( @@ -257,11 +253,7 @@ def test_tenant_redownloads_truncated_file_on_startup( env.pageserver.start() client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) restored_timelines = 
client.timeline_list(tenant_id) assert ( diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index bc2e048f69..5a1e493bbe 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -227,8 +227,8 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) assert timeline_offloaded_logged(leaf_timeline_id) - wait_until(30, 1, leaf_offloaded) - wait_until(30, 1, parent_offloaded) + wait_until(leaf_offloaded) + wait_until(parent_offloaded) # Offloaded child timelines should still prevent deletion with pytest.raises( @@ -331,7 +331,7 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) assert timeline_offloaded_api(child_timeline_id) - wait_until(30, 1, child_offloaded) + wait_until(child_offloaded) assert timeline_offloaded_api(child_timeline_id) assert not timeline_offloaded_api(root_timeline_id) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 155709e106..fbece68367 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -21,7 +21,6 @@ from fixtures.pageserver.utils import ( assert_prefix_empty, assert_prefix_not_empty, many_small_layers_tenant_config, - poll_for_remote_storage_iterations, timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, @@ -94,12 +93,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver - # TODO: review whether this wait_until is actually necessary, we do an await() internally - wait_until( - number_of_iterations=3, - interval=0.2, - func=lambda: timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id), - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id) assert not timeline_path.exists() @@ -111,13 +105,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) assert exc.value.status_code == 404 - wait_until( - number_of_iterations=3, - interval=0.2, - func=lambda: timeline_delete_wait_completed( - ps_http, env.initial_tenant, parent_timeline_id - ), - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, parent_timeline_id) # Check that we didn't pick up the timeline again after restart. # See https://github.com/neondatabase/neon/issues/3560 @@ -226,8 +214,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http.configure_failpoints((failpoint, "return")) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - # These failpoints are earlier than background task is spawned. # so they result in api request failure. 
if failpoint in ( @@ -244,7 +230,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -257,25 +243,21 @@ def test_delete_timeline_exercise_crash_safety_failpoints( env.pageserver.stop() env.pageserver.start() - wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations) + wait_until_tenant_active(ps_http, env.initial_tenant) if failpoint == "timeline-delete-before-index-deleted-at": # We crashed before persisting this to remote storage, need to retry delete request timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) else: # Pageserver should've resumed deletion after restart. - wait_timeline_detail_404( - ps_http, env.initial_tenant, timeline_id, iterations=iterations - ) + wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed # this also checks that delete can be retried even when timeline is in Broken state ps_http.configure_failpoints((failpoint, "off")) - timeline_delete_wait_completed( - ps_http, env.initial_tenant, timeline_id, iterations=iterations - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) # Check remote is empty if remote_storage_kind is RemoteStorageKind.MOCK_S3: @@ -378,7 +360,7 @@ def test_timeline_resurrection_on_attach( env.pageserver.tenant_attach(tenant_id=tenant_id) - wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5) + wait_until_tenant_active(ps_http, tenant_id=tenant_id) timelines = ps_http.timeline_list(tenant_id=tenant_id) assert {TimelineId(tl["timeline_id"]) for tl in timelines} == { @@ -439,7 +421,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild # Wait for tenant to finish loading. wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1) - wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id, iterations=4) + wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id) assert ( not leaf_timeline_path.exists() @@ -481,11 +463,10 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ) # for some reason the check above doesnt immediately take effect for the below. - # Assume it is mock server incosistency and check twice. + # Assume it is mock server incosistency and check a few times. 
wait_until( - 2, - 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), + timeout=2, ) # We deleted our only tenant, and the scrubber fails if it detects nothing @@ -544,7 +525,7 @@ def test_concurrent_timeline_delete_stuck_on( f".*{child_timeline_id}.*at failpoint {stuck_failpoint}" ) - wait_until(50, 0.1, first_call_hit_failpoint) + wait_until(first_call_hit_failpoint, interval=0.1, status_interval=1.0) # make the second call and assert behavior log.info("second call start") @@ -613,7 +594,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def hit_failpoint(): env.pageserver.assert_log_contains(at_failpoint_log_message) - wait_until(50, 0.1, hit_failpoint) + wait_until(hit_failpoint, interval=0.1) # we log this error if a client hangs up # might as well use it as another indicator that the test works @@ -623,7 +604,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def got_hangup_log_message(): env.pageserver.assert_log_contains(hangup_log_message) - wait_until(50, 0.1, got_hangup_log_message) + wait_until(got_hangup_log_message, interval=0.1) # check that the timeline is still present ps_http.timeline_detail(env.initial_tenant, child_timeline_id) @@ -635,10 +616,10 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" env.pageserver.assert_log_contains(message) - wait_until(50, 0.1, first_request_finished) + wait_until(first_request_finished, interval=0.1) # check that the timeline is gone - wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=10) + wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id) def test_timeline_delete_works_for_remote_smoke( @@ -707,7 +688,7 @@ def test_timeline_delete_works_for_remote_smoke( # for some reason the check above doesnt immediately take effect for the below. # Assume it is mock server inconsistency and check twice. 
- wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) + wait_until(lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) # We deleted our only tenant, and the scrubber fails if it detects nothing neon_env_builder.disable_scrub_on_exit() @@ -753,15 +734,13 @@ def test_delete_orphaned_objects( env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}") - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - ps_http.timeline_delete(env.initial_tenant, timeline_id) timeline_info = wait_until_timeline_state( pageserver_http=ps_http, tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -827,8 +806,6 @@ def test_timeline_delete_resumed_on_attach( ) ) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - ps_http.timeline_delete(tenant_id, timeline_id) timeline_info = wait_until_timeline_state( @@ -836,7 +813,7 @@ def test_timeline_delete_resumed_on_attach( tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -871,7 +848,7 @@ def test_timeline_delete_resumed_on_attach( env.pageserver.tenant_attach(tenant_id=tenant_id) # delete should be resumed - wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations) + wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id) tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id) assert not tenant_path.exists() diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 9c7e851ba8..2c3ee38bae 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -203,7 +203,7 @@ def test_ancestor_detach_branched_from( ) client.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different # as there is always "PREV_LSN: invalid" for "before" @@ -336,10 +336,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): # delete the timelines to confirm detach actually worked client.timeline_delete(env.initial_tenant, after) - wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, after) client.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): @@ -973,17 +973,17 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( with ThreadPoolExecutor(max_workers=2) as pool: try: fut = pool.submit(detach_ancestor) - offset = wait_until(10, 1.0, at_failpoint) + offset = wait_until(at_failpoint) delete = pool.submit(start_delete) - offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + offset = wait_until(lambda: at_waiting_on_gate_close(offset)) victim_http.configure_failpoints((pausepoint, "off")) delete.result() - assert 
wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + assert wait_until(is_deleted), f"unimplemented mode {mode}" # TODO: match the error with pytest.raises(PageserverApiException) as exc: @@ -1115,11 +1115,11 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv with ThreadPoolExecutor(max_workers=1) as pool: try: fut = pool.submit(detach_timeline) - wait_until(10, 1.0, paused_at_failpoint) + wait_until(paused_at_failpoint) # let stuck complete stuck_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_completed) + wait_until(first_completed) if mode == "delete_reparentable_timeline": assert first_branch is not None @@ -1127,7 +1127,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv env.initial_tenant, first_branch ) victim_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_branch_gone) + wait_until(first_branch_gone) elif mode == "create_reparentable_timeline": first_branch = create_reparentable_timeline() victim_http.configure_failpoints((pausepoint, "off")) @@ -1271,11 +1271,11 @@ def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( with ThreadPoolExecutor(max_workers=1) as pool: try: fut = pool.submit(detach_timeline) - wait_until(10, 1.0, paused_at_failpoint) + wait_until(paused_at_failpoint) # let stuck complete stuck_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_completed) + wait_until(first_completed) victim_http.configure_failpoints((pausepoint, "off")) @@ -1456,7 +1456,7 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon # other tests take the "detach? reparent complete", but this only hits # "complete". http.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) + wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline) http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) @@ -1518,7 +1518,7 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( with ThreadPoolExecutor(max_workers=1) as pool: detach = pool.submit(detach_and_get_stuck) - offset = wait_until(10, 1.0, request_processing_noted_in_log) + offset = wait_until(request_processing_noted_in_log) # make this named fn tor more clear failure test output logging def pausepoint_hit_with_gc_paused() -> LogCursor: @@ -1529,11 +1529,11 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( ) return at - offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) + offset = wait_until(pausepoint_hit_with_gc_paused) delete_detached() - wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + wait_timeline_detail_404(http, env.initial_tenant, detached) http.configure_failpoints((failpoint, "off")) diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 5a5ca3290a..7605e1f758 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -61,7 +61,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool # deletion unblocks gc http.timeline_delete(env.initial_tenant, foo_branch) - wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) + wait_timeline_detail_404(http, env.initial_tenant, foo_branch) wait_for_another_gc_round() pss.assert_log_contains(gc_active_line) diff --git 
a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 4528bc6180..95bf9106cd 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -396,11 +396,7 @@ def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder): # Wait for the tenant to be loaded client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, env.initial_tenant, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, env.initial_tenant, "Active")) assert_physical_size_invariants( get_physical_size_values(env, env.initial_tenant, new_timeline_id), @@ -433,7 +429,7 @@ def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder get_physical_size_values(env, env.initial_tenant, new_timeline_id), ) - wait_until(10, 1, check) + wait_until(check) def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): @@ -721,7 +717,7 @@ def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int def condition(): assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count - wait_until(5, 1.0, condition) + wait_until(condition) def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): @@ -768,7 +764,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): assert "Active" in set(get_tenant_states().values()) # One tenant should activate, then get stuck in their logical size calculation - wait_until(10, 1, at_least_one_active) + wait_until(at_least_one_active) # Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate time.sleep(5) @@ -836,13 +832,13 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): def all_active(): assert all(s == "Active" for s in get_tenant_states().values()) - wait_until(10, 1, all_active) + wait_until(all_active) # Final control check: restarting with no failpoints at all results in all tenants coming active # without being prompted by client I/O env.pageserver.stop() env.pageserver.start() - wait_until(10, 1, all_active) + wait_until(all_active) assert ( pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants @@ -856,7 +852,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} ) - wait_until(10, 1, at_least_one_active) + wait_until(at_least_one_active) detach_tenant_id = list( [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] @@ -881,7 +877,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one # we detached) - wait_until(10, 1, all_active) + wait_until(all_active) assert len(get_tenant_states()) == n_tenants - 2 @@ -908,7 +904,7 @@ def delete_lazy_activating( try: # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then # hang because of our failpoint blocking activation. 
- wait_until(10, 1, shutting_down) + wait_until(shutting_down) finally: log.info("Clearing failpoint") pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) @@ -1030,13 +1026,13 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): log.info(f"{states}") assert len(states["Active"]) == 1 - wait_until(10, 1, one_is_active) + wait_until(one_is_active) def other_is_attaching(): states = get_tenant_states() assert len(states["Attaching"]) == 1 - wait_until(10, 1, other_is_attaching) + wait_until(other_is_attaching) def eager_tenant_is_active(): resp = client.tenant_status(eager_tenant) @@ -1053,7 +1049,7 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): }, lazy=False, ) - wait_until(10, 1, eager_tenant_is_active) + wait_until(eager_tenant_is_active) other_is_attaching() @@ -1096,7 +1092,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met resp = client.tenant_status(env.initial_tenant) assert resp["state"]["slug"] == "Active" - wait_until(10, 1, initial_tenant_is_active) + wait_until(initial_tenant_is_active) # even though the initial tenant is now active, because it was startup time # attach, it will consume the only permit because logical size calculation @@ -1119,7 +1115,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met assert resp["state"]["slug"] == "Attaching" # paused logical size calculation of env.initial_tenant is keeping it attaching - wait_until(10, 1, lazy_tenant_is_attaching) + wait_until(lazy_tenant_is_attaching) for _ in range(5): lazy_tenant_is_attaching() @@ -1132,10 +1128,10 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met if activation_method == "endpoint": with env.endpoints.create_start("main", tenant_id=lazy_tenant): # starting up the endpoint should make it jump the queue - wait_until(10, 1, lazy_tenant_is_active) + wait_until(lazy_tenant_is_active) elif activation_method == "branch": env.create_timeline("second_branch", lazy_tenant) - wait_until(10, 1, lazy_tenant_is_active) + wait_until(lazy_tenant_is_active) elif activation_method == "delete": delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8fa33b81a9..23d4f23cdb 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2136,7 +2136,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): # Check that on source no segment files are present assert src_sk.list_segments(tenant_id, timeline_id) == [] - wait_until(60, 1, evicted_on_source) + wait_until(evicted_on_source, timeout=60) # Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk, # destination should import the control file only & go into evicted mode immediately @@ -2155,7 +2155,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): # This should be fast, it is a wait_until because eviction state is updated # in the background wrt pull_timeline. 
- wait_until(10, 0.1, evicted_on_destination) + wait_until(evicted_on_destination, timeout=1.0, interval=0.1) # Delete the timeline on the source, to prove that deletion works on an # evicted timeline _and_ that the final compute test is really not using @@ -2178,7 +2178,7 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines") assert n_evicted == 0 - wait_until(10, 1, unevicted_on_dest) + wait_until(unevicted_on_dest, interval=0.1, timeout=1.0) # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries @@ -2606,10 +2606,10 @@ def test_s3_eviction( assert n_evicted # make mypy happy assert int(n_evicted) == n_timelines - wait_until(60, 0.5, all_evicted) + wait_until(all_evicted, timeout=30) # restart should preserve the metric value sk.stop().start() - wait_until(60, 0.5, all_evicted) + wait_until(all_evicted) # and endpoint start should reduce is endpoints[0].start() @@ -2618,7 +2618,7 @@ def test_s3_eviction( assert n_evicted # make mypy happy assert int(n_evicted) < n_timelines - wait_until(60, 0.5, one_unevicted) + wait_until(one_unevicted) # Test resetting uploaded partial segment state. @@ -2666,7 +2666,7 @@ def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder): if isinstance(eviction_state, str) and eviction_state == "Present": raise Exception("eviction didn't happen yet") - wait_until(30, 1, evicted) + wait_until(evicted) # it must have uploaded something uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) log.info(f"uploaded segments before reset: {uploaded_segs}") @@ -2763,7 +2763,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde raise Exception("Partial segment not uploaded yet") - source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded) + source_partial_segment = wait_until(source_partial_segment_uploaded) log.info( f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" ) @@ -2787,7 +2787,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde if evictions is None or evictions == 0: raise Exception("Eviction did not happen on source safekeeper yet") - wait_until(30, 1, evicted) + wait_until(evicted) endpoint.start(safekeepers=[2, 3]) @@ -2804,7 +2804,7 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde ) endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") - wait_until(15, 1, new_partial_segment_uploaded) + wait_until(new_partial_segment_uploaded) log.info( f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" @@ -2833,4 +2833,4 @@ def test_pull_timeline_partial_segment_integrity(neon_env_builder: NeonEnvBuilde if unevictions is None or unevictions == 0: raise Exception("Uneviction did not happen on source safekeeper yet") - wait_until(10, 1, unevicted) + wait_until(unevicted) diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 294f86ffa7..d22a900c59 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -97,7 +97,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil str(safekeeper.id) in exception_string ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" - wait_until(60, 0.5, all_sks_in_wareceiver_state) + 
wait_until(all_sks_in_wareceiver_state, timeout=30) stopped_safekeeper = env.safekeepers[-1] stopped_safekeeper_id = stopped_safekeeper.id @@ -124,7 +124,7 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil str(safekeeper.id) in exception_string ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" - wait_until(60, 0.5, all_but_stopped_sks_in_wareceiver_state) + wait_until(all_but_stopped_sks_in_wareceiver_state, timeout=30) def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int): From 9ce0dd4e5511573487120cac763d22633a07d40a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 2 Dec 2024 11:50:22 +0000 Subject: [PATCH 154/209] storcon: add metric for AZ scheduling violations (#9949) ## Problem We can't easily tell how far the state of shards is from their AZ preferences. This can be a cause of performance issues, so it's important for diagnosability that we can tell easily if there are significant numbers of shards that aren't running in their preferred AZ. Related: https://github.com/neondatabase/cloud/issues/15413 ## Summary of changes - In reconcile_all, count shards that are scheduled into the wrong AZ (if they have a preference), and publish it as a prometheus gauge. - Also calculate a statistic for how many shards wanted to reconcile but couldn't. This is clearly a lazy calculation: reconcile all only runs periodically. But that's okay: shards in the wrong AZ is something that only matters if it stays that way for some period of time. --- storage_controller/src/metrics.rs | 6 ++++++ storage_controller/src/service.rs | 32 +++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index a1f7bc2457..6d5885eba6 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -50,6 +50,12 @@ pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we make an optimization change to a tenant's scheduling pub(crate) storage_controller_schedule_optimization: measured::Counter, + /// How many shards are not scheduled into their preferred AZ + pub(crate) storage_controller_schedule_az_violation: measured::Gauge, + + /// How many shards would like to reconcile but were blocked by concurrency limits + pub(crate) storage_controller_pending_reconciles: measured::Gauge, + /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 636ccf11a1..631fdb4923 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6016,14 +6016,33 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); + // This function is an efficient place to update lazy statistics, since we are walking + // all tenants. 
+ let mut pending_reconciles = 0; + let mut az_violations = 0; + let mut reconciles_spawned = 0; for shard in tenants.values_mut() { + // Accumulate scheduling statistics + if let (Some(attached), Some(preferred)) = + (shard.intent.get_attached(), shard.preferred_az()) + { + let node_az = nodes + .get(attached) + .expect("Nodes exist if referenced") + .get_availability_zone_id(); + if node_az != preferred { + az_violations += 1; + } + } + // Skip checking if this shard is already enqueued for reconciliation if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { // If there is something delayed, then return a nonzero count so that // callers like reconcile_all_now do not incorrectly get the impression // that the system is in a quiescent state. reconciles_spawned = std::cmp::max(1, reconciles_spawned); + pending_reconciles += 1; continue; } @@ -6031,9 +6050,22 @@ impl Service { // dirty, spawn another rone if self.maybe_reconcile_shard(shard, &pageservers).is_some() { reconciles_spawned += 1; + } else if shard.delayed_reconcile { + // Shard wanted to reconcile but for some reason couldn't. + pending_reconciles += 1; } } + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_schedule_az_violation + .set(az_violations as i64); + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pending_reconciles + .set(pending_reconciles as i64); + reconciles_spawned } From a750c14735502019a91055d46351d417ccd2bec1 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 2 Dec 2024 12:29:57 +0000 Subject: [PATCH 155/209] fix(proxy): forward notifications from authentication (#9948) Fixes https://github.com/neondatabase/cloud/issues/20973. This refactors `connect_raw` in order to return direct access to the delayed notices. I cannot find a way to test this with psycopg2 unfortunately, although testing it with psql does return the expected results. --- libs/pq_proto/src/lib.rs | 6 +++ .../postgres-protocol2/src/message/backend.rs | 4 ++ .../proxy/tokio-postgres2/src/cancel_token.rs | 8 ++-- libs/proxy/tokio-postgres2/src/client.rs | 13 +++--- libs/proxy/tokio-postgres2/src/config.rs | 6 +-- libs/proxy/tokio-postgres2/src/connect.rs | 42 +++++++++++++++---- libs/proxy/tokio-postgres2/src/connect_raw.rs | 37 +++++++++------- libs/proxy/tokio-postgres2/src/lib.rs | 5 ++- proxy/src/compute.rs | 38 ++++++++++++----- proxy/src/proxy/mod.rs | 8 ++-- proxy/src/proxy/tests/mod.rs | 8 ++-- 11 files changed, 117 insertions(+), 58 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 4b0331999d..43dfbc22a4 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -565,6 +565,8 @@ pub enum BeMessage<'a> { /// Batch of interpreted, shard filtered WAL records, /// ready for the pageserver to ingest InterpretedWalRecords(InterpretedWalRecordsBody<'a>), + + Raw(u8, &'a [u8]), } /// Common shorthands. @@ -754,6 +756,10 @@ impl BeMessage<'_> { /// one more buffer. 
pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> { match message { + BeMessage::Raw(code, data) => { + buf.put_u8(*code); + write_body(buf, |b| b.put_slice(data)) + } BeMessage::AuthenticationOk => { buf.put_u8(b'R'); write_body(buf, |buf| { diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 356d142f3f..33d77fc252 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -541,6 +541,10 @@ impl NoticeResponseBody { pub fn fields(&self) -> ErrorFields<'_> { ErrorFields { buf: &self.storage } } + + pub fn as_bytes(&self) -> &[u8] { + &self.storage + } } pub struct NotificationResponseBody { diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index b949bf358f..a10e8bf5c3 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -10,10 +10,10 @@ use tokio::net::TcpStream; /// connection. #[derive(Clone)] pub struct CancelToken { - pub(crate) socket_config: Option, - pub(crate) ssl_mode: SslMode, - pub(crate) process_id: i32, - pub(crate) secret_key: i32, + pub socket_config: Option, + pub ssl_mode: SslMode, + pub process_id: i32, + pub secret_key: i32, } impl CancelToken { diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 96200b71e7..a7cd53afc3 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -138,7 +138,7 @@ impl InnerClient { } #[derive(Clone)] -pub(crate) struct SocketConfig { +pub struct SocketConfig { pub host: Host, pub port: u16, pub connect_timeout: Option, @@ -152,7 +152,7 @@ pub(crate) struct SocketConfig { pub struct Client { inner: Arc, - socket_config: Option, + socket_config: SocketConfig, ssl_mode: SslMode, process_id: i32, secret_key: i32, @@ -161,6 +161,7 @@ pub struct Client { impl Client { pub(crate) fn new( sender: mpsc::UnboundedSender, + socket_config: SocketConfig, ssl_mode: SslMode, process_id: i32, secret_key: i32, @@ -172,7 +173,7 @@ impl Client { buffer: Default::default(), }), - socket_config: None, + socket_config, ssl_mode, process_id, secret_key, @@ -188,10 +189,6 @@ impl Client { &self.inner } - pub(crate) fn set_socket_config(&mut self, socket_config: SocketConfig) { - self.socket_config = Some(socket_config); - } - /// Creates a new prepared statement. /// /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), @@ -412,7 +409,7 @@ impl Client { /// connection associated with this client. pub fn cancel_token(&self) -> CancelToken { CancelToken { - socket_config: self.socket_config.clone(), + socket_config: Some(self.socket_config.clone()), ssl_mode: self.ssl_mode, process_id: self.process_id, secret_key: self.secret_key, diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 969c20ba47..26124b38ef 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -2,6 +2,7 @@ use crate::connect::connect; use crate::connect_raw::connect_raw; +use crate::connect_raw::RawConnection; use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; @@ -485,14 +486,11 @@ impl Config { connect(tls, self).await } - /// Connects to a PostgreSQL database over an arbitrary stream. 
- /// - /// All of the settings other than `user`, `password`, `dbname`, `options`, and `application_name` name are ignored. pub async fn connect_raw( &self, stream: S, tls: T, - ) -> Result<(Client, Connection), Error> + ) -> Result, Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 7517fe0cde..98067d91f9 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,13 +1,16 @@ use crate::client::SocketConfig; +use crate::codec::BackendMessage; use crate::config::{Host, TargetSessionAttrs}; use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; use crate::tls::{MakeTlsConnect, TlsConnect}; -use crate::{Client, Config, Connection, Error, SimpleQueryMessage}; +use crate::{Client, Config, Connection, Error, RawConnection, SimpleQueryMessage}; use futures_util::{future, pin_mut, Future, FutureExt, Stream}; +use postgres_protocol2::message::backend::Message; use std::io; use std::task::Poll; use tokio::net::TcpStream; +use tokio::sync::mpsc; pub async fn connect( mut tls: T, @@ -60,7 +63,36 @@ where T: TlsConnect, { let socket = connect_socket(host, port, config.connect_timeout).await?; - let (mut client, mut connection) = connect_raw(socket, tls, config).await?; + let RawConnection { + stream, + parameters, + delayed_notice, + process_id, + secret_key, + } = connect_raw(socket, tls, config).await?; + + let socket_config = SocketConfig { + host: host.clone(), + port, + connect_timeout: config.connect_timeout, + }; + + let (sender, receiver) = mpsc::unbounded_channel(); + let client = Client::new( + sender, + socket_config, + config.ssl_mode, + process_id, + secret_key, + ); + + // delayed notices are always sent as "Async" messages. 
+ let delayed = delayed_notice + .into_iter() + .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) + .collect(); + + let mut connection = Connection::new(stream, delayed, parameters, receiver); if let TargetSessionAttrs::ReadWrite = config.target_session_attrs { let rows = client.simple_query_raw("SHOW transaction_read_only"); @@ -102,11 +134,5 @@ where } } - client.set_socket_config(SocketConfig { - host: host.clone(), - port, - connect_timeout: config.connect_timeout, - }); - Ok((client, connection)) } diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 80677af969..9c6f1a2552 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -3,27 +3,26 @@ use crate::config::{self, AuthKeys, Config, ReplicationMode}; use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::{TlsConnect, TlsStream}; -use crate::{Client, Connection, Error}; +use crate::Error; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; use postgres_protocol2::authentication; use postgres_protocol2::authentication::sasl; use postgres_protocol2::authentication::sasl::ScramSha256; -use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message}; +use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; use postgres_protocol2::message::frontend; -use std::collections::{HashMap, VecDeque}; +use std::collections::HashMap; use std::io; use std::pin::Pin; use std::task::{Context, Poll}; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::sync::mpsc; use tokio_util::codec::Framed; pub struct StartupStream { inner: Framed, PostgresCodec>, buf: BackendMessages, - delayed: VecDeque, + delayed_notice: Vec, } impl Sink for StartupStream @@ -78,11 +77,19 @@ where } } +pub struct RawConnection { + pub stream: Framed, PostgresCodec>, + pub parameters: HashMap, + pub delayed_notice: Vec, + pub process_id: i32, + pub secret_key: i32, +} + pub async fn connect_raw( stream: S, tls: T, config: &Config, -) -> Result<(Client, Connection), Error> +) -> Result, Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, @@ -97,18 +104,20 @@ where }, ), buf: BackendMessages::empty(), - delayed: VecDeque::new(), + delayed_notice: Vec::new(), }; startup(&mut stream, config).await?; authenticate(&mut stream, config).await?; let (process_id, secret_key, parameters) = read_info(&mut stream).await?; - let (sender, receiver) = mpsc::unbounded_channel(); - let client = Client::new(sender, config.ssl_mode, process_id, secret_key); - let connection = Connection::new(stream.inner, stream.delayed, parameters, receiver); - - Ok((client, connection)) + Ok(RawConnection { + stream: stream.inner, + parameters, + delayed_notice: stream.delayed_notice, + process_id, + secret_key, + }) } async fn startup(stream: &mut StartupStream, config: &Config) -> Result<(), Error> @@ -347,9 +356,7 @@ where body.value().map_err(Error::parse)?.to_string(), ); } - Some(msg @ Message::NoticeResponse(_)) => { - stream.delayed.push_back(BackendMessage::Async(msg)) - } + Some(Message::NoticeResponse(body)) => stream.delayed_notice.push(body), Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key, parameters)), Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), diff --git a/libs/proxy/tokio-postgres2/src/lib.rs 
b/libs/proxy/tokio-postgres2/src/lib.rs index 72ba8172b2..57c639a7de 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -1,9 +1,10 @@ //! An asynchronous, pipelined, PostgreSQL client. -#![warn(rust_2018_idioms, clippy::all, missing_docs)] +#![warn(rust_2018_idioms, clippy::all)] pub use crate::cancel_token::CancelToken; -pub use crate::client::Client; +pub use crate::client::{Client, SocketConfig}; pub use crate::config::Config; +pub use crate::connect_raw::RawConnection; pub use crate::connection::Connection; use crate::error::DbError; pub use crate::error::Error; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 2abe88ac88..b689b97a21 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -6,6 +6,7 @@ use std::time::Duration; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; +use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; use rustls::crypto::ring; @@ -13,6 +14,7 @@ use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; +use tokio_postgres::{CancelToken, RawConnection}; use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; @@ -277,6 +279,8 @@ pub(crate) struct PostgresConnection { pub(crate) cancel_closure: CancelClosure, /// Labels for proxy's metrics. pub(crate) aux: MetricsAuxInfo, + /// Notices received from compute after authenticating + pub(crate) delayed_notice: Vec, _guage: NumDbConnectionsGuard<'static>, } @@ -322,10 +326,19 @@ impl ConnCfg { // connect_raw() will not use TLS if sslmode is "disable" let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (client, connection) = self.0.connect_raw(stream, tls).await?; + let connection = self.0.connect_raw(stream, tls).await?; drop(pause); - tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); - let stream = connection.stream.into_inner(); + + let RawConnection { + stream, + parameters, + delayed_notice, + process_id, + secret_key, + } = connection; + + tracing::Span::current().record("pid", tracing::field::display(process_id)); + let stream = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( @@ -334,18 +347,23 @@ impl ConnCfg { self.0.get_ssl_mode() ); - // This is very ugly but as of now there's no better way to - // extract the connection parameters from tokio-postgres' connection. - // TODO: solve this problem in a more elegant manner (e.g. the new library). - let params = connection.parameters; - // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. // Yet another reason to rework the connection establishing code. - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token(), vec![]); + let cancel_closure = CancelClosure::new( + socket_addr, + CancelToken { + socket_config: None, + ssl_mode: self.0.get_ssl_mode(), + process_id, + secret_key, + }, + vec![], + ); let connection = PostgresConnection { stream, - params, + params: parameters, + delayed_notice, cancel_closure, aux, _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 956036d29d..af97fb3d71 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -384,11 +384,13 @@ pub(crate) async fn prepare_client_connection
<P>
( // The new token (cancel_key_data) will be sent to the client. let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); + // Forward all deferred notices to the client. + for notice in &node.delayed_notice { + stream.write_message_noflush(&Be::Raw(b'N', notice.as_bytes()))?; + } + // Forward all postgres connection params to the client. - // Right now the implementation is very hacky and inefficent (ideally, - // we don't need an intermediate hashmap), but at least it should be correct. for (name, value) in &node.params { - // TODO: Theoretically, this could result in a big pile of params... stream.write_message_noflush(&Be::ParameterStatus { name: name.as_bytes(), value: value.as_bytes(), diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 2c2c2964b6..15be6c9724 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) @@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .options("project=generic-project-name") @@ -296,7 +296,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { Scram::new(password).await?, )); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .channel_binding(tokio_postgres::config::ChannelBinding::Require) .user("user") .dbname("db") @@ -320,7 +320,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .channel_binding(tokio_postgres::config::ChannelBinding::Disable) .user("user") .dbname("db") From 4f6c5949739d7d85a0f3535494a68a345b9e7b6d Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 2 Dec 2024 12:46:07 +0000 Subject: [PATCH 156/209] CI(replication-tests): fix notifications about replication-tests failures (#9950) ## Problem `if: ${{ github.event.schedule }}` gets skipped if a previous step has failed, but we want to run the step for both `success` and `failure` ## Summary of changes - Add `!cancelled()` to notification step if-condition, to skip only cancelled jobs --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ea8fee80c2..7621d72f64 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -249,7 +249,7 @@ jobs: # Post both success and failure to the Slack channel - name: Post to a Slack channel - if: ${{ github.event.schedule }} + if: ${{ github.event.schedule && !cancelled() }} uses: slackapi/slack-github-action@v1 with: channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream From b6f93dcec94ec6481ef4a427c5ca1b349216967f Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 2 Dec 2024 16:38:12 +0100 Subject: [PATCH 157/209] proxy: Create Elasticache credentials provider lazily (#9967) ## Problem The 
credentials providers tries to connect to AWS STS even when we use plain Redis connections. ## Summary of changes * Construct the CredentialsProvider only when needed ("irsa"). --- proxy/src/bin/proxy.rs | 49 +++++--------------------------- proxy/src/redis/elasticache.rs | 51 ++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index b772a987ee..c929b97d78 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -3,14 +3,6 @@ use std::pin::pin; use std::sync::Arc; use anyhow::bail; -use aws_config::environment::EnvironmentVariableCredentialsProvider; -use aws_config::imds::credentials::ImdsCredentialsProvider; -use aws_config::meta::credentials::CredentialsProviderChain; -use aws_config::meta::region::RegionProviderChain; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::provider_config::ProviderConfig; -use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; -use aws_config::Region; use futures::future::Either; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; @@ -314,39 +306,7 @@ async fn main() -> anyhow::Result<()> { }; info!("Using region: {}", args.aws_region); - let region_provider = - RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone())); - let provider_conf = - ProviderConfig::without_region().with_region(region_provider.region().await); - let aws_credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new()) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" - // needed to access remote extensions bucket - .or_else( - "token", - WebIdentityTokenCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) - }; - let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( - elasticache::AWSIRSAConfig::new( - args.aws_region.clone(), - args.redis_cluster_name, - args.redis_user_id, - ), - aws_credentials_provider, - )); + // TODO: untangle the config args let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { ("plain", redis_url) => match redis_url { None => { @@ -361,7 +321,12 @@ async fn main() -> anyhow::Result<()> { ConnectionWithCredentialsProvider::new_with_credentials_provider( host.to_string(), port, - elasticache_credentials_provider.clone(), + elasticache::CredentialsProvider::new( + args.aws_region, + args.redis_cluster_name, + args.redis_user_id, + ) + .await, ), ), (None, None) => { diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index d118c8f412..bf6dde9332 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -1,6 +1,14 @@ +use std::sync::Arc; use std::time::{Duration, SystemTime}; +use aws_config::environment::EnvironmentVariableCredentialsProvider; +use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::meta::region::RegionProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use 
aws_config::provider_config::ProviderConfig; +use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use aws_config::Region; use aws_sdk_iam::config::ProvideCredentials; use aws_sigv4::http_request::{ self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, @@ -45,12 +53,45 @@ pub struct CredentialsProvider { } impl CredentialsProvider { - pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self { - CredentialsProvider { - config, - credentials_provider, - } + pub async fn new( + aws_region: String, + redis_cluster_name: Option, + redis_user_id: Option, + ) -> Arc { + let region_provider = + RegionProviderChain::default_provider().or_else(Region::new(aws_region.clone())); + let provider_conf = + ProviderConfig::without_region().with_region(region_provider.region().await); + let aws_credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + CredentialsProviderChain::first_try( + "env", + EnvironmentVariableCredentialsProvider::new(), + ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // needed to access remote extensions bucket + .or_else( + "token", + WebIdentityTokenCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses imds v2 + .or_else("imds", ImdsCredentialsProvider::builder().build()) + }; + Arc::new(CredentialsProvider { + config: AWSIRSAConfig::new(aws_region, redis_cluster_name, redis_user_id), + credentials_provider: aws_credentials_provider, + }) } + pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { let aws_credentials = self .credentials_provider From 84687b743d2323cb289321c2cc290798fe71b7b5 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 2 Dec 2024 19:10:44 +0300 Subject: [PATCH 158/209] Update consensus protocol spec (#9607) The spec was written for the buggy protocol which we had before the one more similar to Raft was implemented. Update the spec with what we currently have. 
ref https://github.com/neondatabase/neon/issues/8699 --- safekeeper/spec/.gitignore | 3 + safekeeper/spec/MCProposerAcceptorStatic.tla | 31 + safekeeper/spec/ProposerAcceptorConsensus.cfg | 34 - safekeeper/spec/ProposerAcceptorConsensus.tla | 363 ---- safekeeper/spec/ProposerAcceptorStatic.tla | 449 +++++ safekeeper/spec/modelcheck.sh | 49 + .../MCProposerAcceptorStatic_p2_a3_t2_l2.cfg | 19 + .../MCProposerAcceptorStatic_p2_a3_t3_l2.cfg | 19 + .../MCProposerAcceptorStatic_p2_a3_t3_l3.cfg | 17 + .../MCProposerAcceptorStatic_p2_a3_t4_l4.cfg | 17 + .../MCProposerAcceptorStatic_p2_a5_t2_l2.cfg | 16 + .../MCProposerAcceptorStatic_p2_a5_t3_l3.cfg | 16 + .../MCProposerAcceptorStatic_p2_a5_t4_l3.cfg | 16 + safekeeper/spec/readme.md | 12 + ...c_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log | 63 + ...c_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log | 69 + ...c_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log | 72 + ...c_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log | 1466 +++++++++++++++++ ...c_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log | 1374 +++++++++++++++ ...c_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log | 89 + 20 files changed, 3797 insertions(+), 397 deletions(-) create mode 100644 safekeeper/spec/.gitignore create mode 100644 safekeeper/spec/MCProposerAcceptorStatic.tla delete mode 100644 safekeeper/spec/ProposerAcceptorConsensus.cfg delete mode 100644 safekeeper/spec/ProposerAcceptorConsensus.tla create mode 100644 safekeeper/spec/ProposerAcceptorStatic.tla create mode 100755 safekeeper/spec/modelcheck.sh create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg create mode 100644 safekeeper/spec/readme.md create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log diff --git a/safekeeper/spec/.gitignore b/safekeeper/spec/.gitignore new file mode 100644 index 0000000000..7233153039 --- /dev/null +++ b/safekeeper/spec/.gitignore @@ -0,0 +1,3 @@ +*TTrace* +*.toolbox/ +states/ diff --git a/safekeeper/spec/MCProposerAcceptorStatic.tla b/safekeeper/spec/MCProposerAcceptorStatic.tla new file mode 100644 index 0000000000..be3d99c697 --- /dev/null +++ b/safekeeper/spec/MCProposerAcceptorStatic.tla @@ -0,0 +1,31 @@ +---- MODULE MCProposerAcceptorStatic ---- +EXTENDS TLC, ProposerAcceptorStatic + +\* Augments the spec with model checking 
constraints. + +\* For model checking. +CONSTANTS + max_entries, \* model constraint: max log entries acceptor/proposer can hold + max_term \* model constraint: max allowed term + +ASSUME max_entries \in Nat /\ max_term \in Nat + +\* Model space constraint. +StateConstraint == \A p \in proposers: + /\ prop_state[p].term <= max_term + /\ Len(prop_state[p].wal) <= max_entries +\* Sets of proposers and acceptors are symmetric because we don't take any +\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN +\* ...) +ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors) + +\* enforce order of the vars in the error trace with ALIAS +\* Note that ALIAS is supported only since version 1.8.0 which is pre-release +\* as of writing this. +Alias == [ + prop_state |-> prop_state, + acc_state |-> acc_state, + committed |-> committed + ] + +==== diff --git a/safekeeper/spec/ProposerAcceptorConsensus.cfg b/safekeeper/spec/ProposerAcceptorConsensus.cfg deleted file mode 100644 index 989c86e47d..0000000000 --- a/safekeeper/spec/ProposerAcceptorConsensus.cfg +++ /dev/null @@ -1,34 +0,0 @@ -\* MV CONSTANT declarations -CONSTANT NULL = NULL -CONSTANTS -p1 = p1 -p2 = p2 -p3 = p3 -a1 = a1 -a2 = a2 -a3 = a3 -\* MV CONSTANT definitions -CONSTANT -proposers = {p1, p2} -acceptors = {a1, a2, a3} -\* SYMMETRY definition -SYMMETRY perms -\* CONSTANT definitions -CONSTANT -max_term = 3 -CONSTANT -max_entries = 3 -\* INIT definition -INIT -Init -\* NEXT definition -NEXT -Next -\* INVARIANT definition -INVARIANT -TypeOk -ElectionSafety -LogIsMonotonic -LogSafety -CommittedNotOverwritten -CHECK_DEADLOCK FALSE \ No newline at end of file diff --git a/safekeeper/spec/ProposerAcceptorConsensus.tla b/safekeeper/spec/ProposerAcceptorConsensus.tla deleted file mode 100644 index e5f0bb270f..0000000000 --- a/safekeeper/spec/ProposerAcceptorConsensus.tla +++ /dev/null @@ -1,363 +0,0 @@ ----- MODULE ProposerAcceptorConsensus ---- - -\* Differences from current implementation: -\* - unified not-globally-unique epoch & term (node_id) -\* Simplifications: -\* - instant message delivery -\* - feedback is not modeled separately, commit_lsn is updated directly - -EXTENDS Integers, Sequences, FiniteSets, TLC - -VARIABLES - prop_state, \* prop_state[p] is state of proposer p - acc_state, \* acc_state[a] is state of acceptor a - commit_lsns \* map of acceptor -> commit_lsn - -CONSTANT - acceptors, - proposers, - max_entries, \* model constraint: max log entries acceptor/proposer can hold - max_term \* model constraint: max allowed term - -CONSTANT NULL - -ASSUME max_entries \in Nat /\ max_term \in Nat - -\* For specifying symmetry set in manual cfg file, see -\* https://github.com/tlaplus/tlaplus/issues/404 -perms == Permutations(proposers) \union Permutations(acceptors) - -\******************************************************************************** -\* Helpers -\******************************************************************************** - -Maximum(S) == - (*************************************************************************) - (* If S is a set of numbers, then this define Maximum(S) to be the *) - (* maximum of those numbers, or -1 if S is empty. 
*) - (*************************************************************************) - IF S = {} THEN -1 - ELSE CHOOSE n \in S : \A m \in S : n \geq m - -\* minimum of numbers in the set, error if set is empty -Minimum(S) == - CHOOSE min \in S : \A n \in S : min <= n - -\* Min of two numbers -Min(a, b) == IF a < b THEN a ELSE b - -\* Set of values of function f. XXX is there a such builtin? -FValues(f) == {f[a] : a \in DOMAIN f} - -\* Sort of 0 for functions -EmptyF == [x \in {} |-> 42] -IsEmptyF(f) == DOMAIN f = {} - -\* Next entry proposer p will push to acceptor a or NULL. -NextEntry(p, a) == - IF Len(prop_state[p].wal) >= prop_state[p].next_send_lsn[a] THEN - CHOOSE r \in FValues(prop_state[p].wal) : r.lsn = prop_state[p].next_send_lsn[a] - ELSE - NULL - - -\***************** - -NumAccs == Cardinality(acceptors) - -\* does acc_set form the quorum? -Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) -\* all quorums of acceptors -Quorums == {subset \in SUBSET acceptors: Quorum(subset)} - -\* flush_lsn of acceptor a. -FlushLsn(a) == Len(acc_state[a].wal) - - -\******************************************************************************** -\* Type assertion -\******************************************************************************** -\* Defining sets of all possible tuples and using them in TypeOk in usual -\* all-tuples constructor is not practical because such definitions force -\* TLC to enumerate them, while they are are horribly enormous -\* (TLC screams "Attempted to construct a set with too many elements"). -\* So instead check types manually. -TypeOk == - /\ \A p \in proposers: - /\ DOMAIN prop_state[p] = {"state", "term", "votes", "donor_epoch", "vcl", "wal", "next_send_lsn"} - \* in campaign proposer sends RequestVote and waits for acks; - \* in leader he is elected - /\ prop_state[p].state \in {"campaign", "leader"} - \* 0..max_term should be actually Nat in the unbounded model, but TLC won't - \* swallow it - /\ prop_state[p].term \in 0..max_term - \* votes received - /\ \A voter \in DOMAIN prop_state[p].votes: - /\ voter \in acceptors - /\ prop_state[p].votes[voter] \in [epoch: 0..max_term, flush_lsn: 0..max_entries] - /\ prop_state[p].donor_epoch \in 0..max_term - \* wal is sequence of just records - /\ \A i \in DOMAIN prop_state[p].wal: - prop_state[p].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term] - \* Following implementation, we skew the original Aurora meaning of this; - \* here it is lsn of highest definitely committed record as set by proposer - \* when it is elected; it doesn't change since then - /\ prop_state[p].vcl \in 0..max_entries - \* map of acceptor -> next lsn to send - /\ \A a \in DOMAIN prop_state[p].next_send_lsn: - /\ a \in acceptors - /\ prop_state[p].next_send_lsn[a] \in 1..(max_entries + 1) - /\ \A a \in acceptors: - /\ DOMAIN acc_state[a] = {"term", "epoch", "wal"} - /\ acc_state[a].term \in 0..max_term - /\ acc_state[a].epoch \in 0..max_term - /\ \A i \in DOMAIN acc_state[a].wal: - acc_state[a].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term] - /\ \A a \in DOMAIN commit_lsns: - /\ a \in acceptors - /\ commit_lsns[a] \in 0..max_entries - -\******************************************************************************** -\* Initial -\******************************************************************************** - -Init == - /\ prop_state = [p \in proposers |-> [ - state |-> "campaign", - term |-> 1, - votes |-> EmptyF, - donor_epoch |-> 0, - vcl |-> 0, - wal |-> << >>, - next_send_lsn |-> EmptyF - ]] - /\ acc_state = [a \in 
acceptors |-> [ - \* there will be no leader in this term, 1 is the first real - term |-> 0, - epoch |-> 0, - wal |-> << >> - ]] - /\ commit_lsns = [a \in acceptors |-> 0] - - -\******************************************************************************** -\* Actions -\******************************************************************************** - -\* Proposer loses all state. -\* For simplicity (and to reduct state space), we assume it immediately gets -\* current state from quorum q of acceptors determining the term he will request -\* to vote for. -RestartProposer(p, q) == - /\ Quorum(q) - /\ LET - new_term == Maximum({acc_state[a].term : a \in q}) + 1 - IN - /\ new_term <= max_term - /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", - ![p].term = new_term, - ![p].votes = EmptyF, - ![p].donor_epoch = 0, - ![p].vcl = 0, - ![p].wal = << >>, - ![p].next_send_lsn = EmptyF] - /\ UNCHANGED <> - -\* Acceptor a immediately votes for proposer p. -Vote(p, a) == - /\ prop_state[p].state = "campaign" - /\ acc_state[a].term < prop_state[p].term \* main voting condition - /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] - /\ LET - vote == [epoch |-> acc_state[a].epoch, flush_lsn |-> FlushLsn(a)] - IN - prop_state' = [prop_state EXCEPT ![p].votes = prop_state[p].votes @@ (a :> vote)] - /\ UNCHANGED <> - - -\* Proposer p gets elected. -BecomeLeader(p) == - /\ prop_state[p].state = "campaign" - /\ Quorum(DOMAIN prop_state[p].votes) - /\ LET - max_epoch == Maximum({v.epoch : v \in FValues(prop_state[p].votes)}) - max_epoch_votes == {v \in FValues(prop_state[p].votes) : v.epoch = max_epoch} - donor == CHOOSE dv \in DOMAIN prop_state[p].votes : - /\ prop_state[p].votes[dv].epoch = max_epoch - /\ \A v \in max_epoch_votes: - prop_state[p].votes[dv].flush_lsn >= v.flush_lsn - max_vote == prop_state[p].votes[donor] - \* Establish lsn to stream from for voters. - \* At some point it seemed like we can regard log as correct and only - \* append to it if has in the max_epoch, however TLC showed that's not - \* the case; we must always stream since first not matching record. - next_send_lsn == [voter \in DOMAIN prop_state[p].votes |-> 1] - IN - \* we fetch log from the most advanced node (this is separate - \* roundtrip), make sure node is still on one term with us - /\ acc_state[donor].term = prop_state[p].term - /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", - \* fetch the log from donor - ![p].wal = acc_state[donor].wal, - ![p].donor_epoch = max_epoch, - ![p].vcl = max_vote.flush_lsn, - ![p].next_send_lsn = next_send_lsn] - /\ UNCHANGED <> - - -\* acceptor a learns about elected proposer p's term. 
-UpdateTerm(p, a) == - /\ prop_state[p].state = "leader" - /\ acc_state[a].term < prop_state[p].term - /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] - /\ UNCHANGED <> - - -\* Acceptor a which didn't participate in voting connects to elected proposer p -\* and p sets the streaming point -HandshakeWithLeader(p, a) == - /\ prop_state[p].state = "leader" - /\ acc_state[a].term = prop_state[p].term - /\ a \notin DOMAIN prop_state[p].next_send_lsn - /\ LET - next_send_lsn == prop_state[p].next_send_lsn @@ (a :> 1) - IN - prop_state' = [prop_state EXCEPT ![p].next_send_lsn = next_send_lsn] - /\ UNCHANGED <> - - -\* Append new log entry to elected proposer -NewEntry(p) == - /\ prop_state[p].state = "leader" - /\ Len(prop_state[p].wal) < max_entries \* model constraint - /\ LET - new_lsn == IF Len(prop_state[p].wal) = 0 THEN - prop_state[p].vcl + 1 - ELSE - \* lsn of last record + 1 - prop_state[p].wal[Len(prop_state[p].wal)].lsn + 1 - new_entry == [lsn |-> new_lsn, epoch |-> prop_state[p].term] - IN - /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)] - /\ UNCHANGED <> - - -\* Write entry new_e to log wal, rolling back all higher entries if e is different. -\* If bump_epoch is TRUE, it means we get record with lsn=vcl and going to update -\* the epoch. Truncate log in this case as well, as we might have correct <= vcl -\* part and some outdated entries behind it which we want to purge before -\* declaring us as recovered. Another way to accomplish this (in previous commit) -\* is wait for first-entry-from-new-epoch before bumping it. -WriteEntry(wal, new_e, bump_epoch) == - (new_e.lsn :> new_e) @@ - \* If wal has entry with such lsn and it is different, truncate all higher log. - IF \/ (new_e.lsn \in DOMAIN wal /\ wal[new_e.lsn] /= new_e) - \/ bump_epoch THEN - SelectSeq(wal, LAMBDA e: e.lsn < new_e.lsn) - ELSE - wal - - -\* Try to transfer entry from elected proposer p to acceptor a -TransferEntry(p, a) == - /\ prop_state[p].state = "leader" - /\ prop_state[p].term = acc_state[a].term - /\ a \in DOMAIN prop_state[p].next_send_lsn - /\ LET - next_e == NextEntry(p, a) - IN - /\ next_e /= NULL - /\ LET - \* Consider bumping epoch if getting this entry recovers the acceptor, - \* that is, we reach first record behind VCL. - new_epoch == - IF /\ acc_state[a].epoch < prop_state[p].term - /\ next_e.lsn >= prop_state[p].vcl - THEN - prop_state[p].term - ELSE - acc_state[a].epoch - \* Also check whether this entry allows to advance commit_lsn and - \* if so, bump it where possible. Modeling this as separate action - \* significantly bloats the space (5m vs 15m on max_entries=3 max_term=3, - \* so act immediately. 
- entry_owners == {o \in acceptors: - /\ o /= a - \* only recovered acceptors advance commit_lsn - /\ acc_state[o].epoch = prop_state[p].term - /\ next_e \in FValues(acc_state[o].wal)} \cup {a} - IN - /\ acc_state' = [acc_state EXCEPT ![a].wal = WriteEntry(acc_state[a].wal, next_e, new_epoch /= acc_state[a].epoch), - ![a].epoch = new_epoch] - /\ prop_state' = [prop_state EXCEPT ![p].next_send_lsn[a] = - prop_state[p].next_send_lsn[a] + 1] - /\ commit_lsns' = IF /\ new_epoch = prop_state[p].term - /\ Quorum(entry_owners) - THEN - [acc \in acceptors |-> - IF /\ acc \in entry_owners - /\ next_e.lsn > commit_lsns[acc] - THEN - next_e.lsn - ELSE - commit_lsns[acc]] - ELSE - commit_lsns - - -\******************************************************************************* -\* Final spec -\******************************************************************************* - -Next == - \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) - \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) - \/ \E p \in proposers: BecomeLeader(p) - \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) - \/ \E p \in proposers: \E a \in acceptors: HandshakeWithLeader(p, a) - \/ \E p \in proposers: NewEntry(p) - \/ \E p \in proposers: \E a \in acceptors: TransferEntry(p, a) - -Spec == Init /\ [][Next]_<> - - -\******************************************************************************** -\* Invariants -\******************************************************************************** - -\* we don't track history, but this property is fairly convincing anyway -ElectionSafety == - \A p1, p2 \in proposers: - (/\ prop_state[p1].state = "leader" - /\ prop_state[p2].state = "leader" - /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2) - -LogIsMonotonic == - \A a \in acceptors: - \A i \in DOMAIN acc_state[a].wal: \A j \in DOMAIN acc_state[a].wal: - (i > j) => (/\ acc_state[a].wal[i].lsn > acc_state[a].wal[j].lsn - /\ acc_state[a].wal[i].epoch >= acc_state[a].wal[j].epoch) - -\* Main invariant: log under commit_lsn must match everywhere. -LogSafety == - \A a1 \in acceptors: \A a2 \in acceptors: - LET - common_len == Min(commit_lsns[a1], commit_lsns[a2]) - IN - SubSeq(acc_state[a1].wal, 1, common_len) = SubSeq(acc_state[a2].wal, 1, common_len) - -\* Next record we are going to push to acceptor must never overwrite committed -\* different record. -CommittedNotOverwritten == - \A p \in proposers: \A a \in acceptors: - (/\ prop_state[p].state = "leader" - /\ prop_state[p].term = acc_state[a].term - /\ a \in DOMAIN prop_state[p].next_send_lsn) => - LET - next_e == NextEntry(p, a) - IN - (next_e /= NULL) => - ((commit_lsns[a] >= next_e.lsn) => (acc_state[a].wal[next_e.lsn] = next_e)) - - -==== \ No newline at end of file diff --git a/safekeeper/spec/ProposerAcceptorStatic.tla b/safekeeper/spec/ProposerAcceptorStatic.tla new file mode 100644 index 0000000000..b2d2f005db --- /dev/null +++ b/safekeeper/spec/ProposerAcceptorStatic.tla @@ -0,0 +1,449 @@ +---- MODULE ProposerAcceptorStatic ---- + +(* + The protocol is very similar to Raft. The key differences are: + - Leaders (proposers) are separated from storage nodes (acceptors), which has + been already an established way to think about Paxos. + - We don't want to stamp each log record with term, so instead carry around + term histories which are sequences of pairs. 
+ As a bonus (and subtlety) this allows the proposer to commit entries from + previous terms without writing new records -- if acceptor's log is caught + up, update of term history on it updates last_log_term as well. +*) + +\* Model simplifications: +\* - Instant message delivery. Notably, ProposerElected message (TruncateWal action) is not +\* delayed, so we don't attempt to truncate WAL when the same wp already appended something +\* on the acceptor since common point had been calculated (this should be rejected). +\* - old WAL is immediately copied to proposer on its election, without on-demand fetch later. + +\* Some ideas how to break it to play around to get a feeling: +\* - replace Quorums with BadQuorums. +\* - remove 'don't commit entries from previous terms separately' rule in +\* CommitEntries and observe figure 8 from the raft paper. +\* With p2a3t4l4 32 steps error was found in 1h on 80 cores. + +EXTENDS Integers, Sequences, FiniteSets, TLC + +VARIABLES + prop_state, \* prop_state[p] is state of proposer p + acc_state, \* acc_state[a] is state of acceptor a + committed, \* bag (set) of ever committed <> entries + elected_history \* counter for elected terms, see TypeOk for details + +CONSTANT + acceptors, + proposers + +CONSTANT NULL + +\******************************************************************************** +\* Helpers +\******************************************************************************** + +Maximum(S) == + (*************************************************************************) + (* If S is a set of numbers, then this define Maximum(S) to be the *) + (* maximum of those numbers, or -1 if S is empty. *) + (*************************************************************************) + IF S = {} THEN -1 ELSE CHOOSE n \in S : \A m \in S : n \geq m + +\* minimum of numbers in the set, error if set is empty +Minimum(S) == CHOOSE min \in S : \A n \in S : min <= n + +\* Min of two numbers +Min(a, b) == IF a < b THEN a ELSE b + +\* Sort of 0 for functions +EmptyF == [x \in {} |-> 42] +IsEmptyF(f) == DOMAIN f = {} + +\* Set of values (image) of the function f. Apparently no such builtin. +Range(f) == {f[x] : x \in DOMAIN f} + +\* If key k is in function f, map it using l, otherwise insert v. Returns the +\* updated function. +Upsert(f, k, v, l(_)) == + LET new_val == IF k \in DOMAIN f THEN l(f[k]) ELSE v IN + (k :> new_val) @@ f + +\***************** + +NumAccs == Cardinality(acceptors) + +\* does acc_set form the quorum? +Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) +\* all quorums of acceptors +Quorums == {subset \in SUBSET acceptors: Quorum(subset)} + +\* For substituting Quorums and seeing what happens. +BadQuorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2) +BadQuorums == {subset \in SUBSET acceptors: BadQuorum(subset)} + +\* flushLsn (end of WAL, i.e. index of next entry) of acceptor a. +FlushLsn(a) == Len(acc_state[a].wal) + 1 + +\* Typedefs. Note that TLA+ Nat includes zero. +Terms == Nat +Lsns == Nat + +\******************************************************************************** +\* Type assertion +\******************************************************************************** +\* Defining sets of all possible tuples and using them in TypeOk in usual +\* all-tuples constructor is not practical because such definitions force +\* TLC to enumerate them, while they are are horribly enormous +\* (TLC screams "Attempted to construct a set with too many elements"). +\* So instead check types manually. 
+ + +\* Term history is a sequence of pairs. +IsTermHistory(th) == + \A th_entry \in Range(th): th_entry.term \in Terms /\ th_entry.lsn \in Lsns + +IsWal(w) == + \A i \in DOMAIN w: + /\ i \in Lsns + /\ w[i] \in Terms + +TypeOk == + /\ \A p \in proposers: + \* '_' in field names hinders pretty printing + \* https://github.com/tlaplus/tlaplus/issues/1051 + \* so use camel case. + /\ DOMAIN prop_state[p] = {"state", "term", "votes", "termHistory", "wal", "nextSendLsn"} + \* In campaign proposer sends RequestVote and waits for acks; + \* in leader he is elected. + /\ prop_state[p].state \in {"campaign", "leader"} + \* term for which it will campaign, or won term in leader state + /\ prop_state[p].term \in Terms + \* votes received + /\ \A voter \in DOMAIN prop_state[p].votes: voter \in acceptors + /\ \A vote \in Range(prop_state[p].votes): + /\ IsTermHistory(vote.termHistory) + /\ vote.flushLsn \in Lsns + \* Proposer's term history. Empty while proposer is in "campaign". + /\ IsTermHistory(prop_state[p].termHistory) + \* In the model we identify WAL entries only by pairs + \* without additional unique id, which is enough for its purposes. + \* It means that with term history fully modeled wal becomes + \* redundant as it can be computed from term history + WAL length. + \* However, we still keep it here and at acceptors as explicit sequence + \* where index is LSN and value is the term to avoid artificial mapping to + \* figure out real entries. It shouldn't bloat model much because this + \* doesn't increase number of distinct states. + /\ IsWal(prop_state[p].wal) + \* Map of acceptor -> next lsn to send. It is set when truncate_wal is + \* done so sending entries is allowed only after that. In the impl TCP + \* ensures this ordering. + /\ \A a \in DOMAIN prop_state[p].nextSendLsn: + /\ a \in acceptors + /\ prop_state[p].nextSendLsn[a] \in Lsns + /\ \A a \in acceptors: + /\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"} + /\ acc_state[a].term \in Terms + /\ IsTermHistory(acc_state[a].termHistory) + /\ IsWal(acc_state[a].wal) + /\ \A c \in committed: + /\ c.term \in Terms + /\ c.lsn \in Lsns + \* elected_history is a retrospective map of term -> number of times it was + \* elected, for use in ElectionSafetyFull invariant. For static spec it is + \* fairly convincing that it holds, but with membership change it is less + \* trivial. And as we identify log entries only with , importance + \* of it is quite high as violation of log safety might go undetected if + \* election safety is violated. Note though that this is not always the + \* case, i.e. you can imagine (and TLC should find) schedule where log + \* safety violation is still detected because two leaders with the same term + \* commit histories which are different in previous terms, so it is not that + \* crucial. Plus if spec allows ElectionSafetyFull violation, likely + \* ElectionSafety will also be violated in some schedules. But neither it + \* should bloat the model too much. 
+ /\ \A term \in DOMAIN elected_history: + /\ term \in Terms + /\ elected_history[term] \in Nat + +\******************************************************************************** +\* Initial +\******************************************************************************** + +Init == + /\ prop_state = [p \in proposers |-> [ + state |-> "campaign", + term |-> 1, + votes |-> EmptyF, + termHistory |-> << >>, + wal |-> << >>, + nextSendLsn |-> EmptyF + ]] + /\ acc_state = [a \in acceptors |-> [ + \* There will be no leader in zero term, 1 is the first + \* real. + term |-> 0, + \* Again, leader in term 0 doesn't exist, but we initialize + \* term histories with it to always have common point in + \* them. Lsn is 1 because TLA+ sequences are indexed from 1 + \* (we don't want to truncate WAL out of range). + termHistory |-> << [term |-> 0, lsn |-> 1] >>, + wal |-> << >> + ]] + /\ committed = {} + /\ elected_history = EmptyF + + +\******************************************************************************** +\* Actions +\******************************************************************************** + +\* Proposer loses all state. +\* For simplicity (and to reduct state space), we assume it immediately gets +\* current state from quorum q of acceptors determining the term he will request +\* to vote for. +RestartProposer(p, q) == + /\ Quorum(q) + /\ LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN + /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", + ![p].term = new_term, + ![p].votes = EmptyF, + ![p].termHistory = << >>, + ![p].wal = << >>, + ![p].nextSendLsn = EmptyF] + /\ UNCHANGED <> + +\* Term history of acceptor a's WAL: the one saved truncated to contain only <= +\* local FlushLsn entries. +AcceptorTermHistory(a) == + SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a)) + +\* Acceptor a immediately votes for proposer p. +Vote(p, a) == + /\ prop_state[p].state = "campaign" + /\ acc_state[a].term < prop_state[p].term \* main voting condition + /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] + /\ LET + vote == [termHistory |-> AcceptorTermHistory(a), flushLsn |-> FlushLsn(a)] + IN + prop_state' = [prop_state EXCEPT ![p].votes = (a :> vote) @@ prop_state[p].votes] + /\ UNCHANGED <> + + +\* Get lastLogTerm from term history th. +LastLogTerm(th) == th[Len(th)].term + +\* Proposer p gets elected. +BecomeLeader(p) == + /\ prop_state[p].state = "campaign" + /\ Quorum(DOMAIN prop_state[p].votes) + /\ LET + \* Find acceptor with the highest vote. + max_vote_acc == + CHOOSE a \in DOMAIN prop_state[p].votes: + LET v == prop_state[p].votes[a] + IN \A v2 \in Range(prop_state[p].votes): + /\ LastLogTerm(v.termHistory) >= LastLogTerm(v2.termHistory) + /\ (LastLogTerm(v.termHistory) = LastLogTerm(v2.termHistory) => v.flushLsn >= v2.flushLsn) + max_vote == prop_state[p].votes[max_vote_acc] + prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn]) + IN + \* We copy all log preceding proposer's term from the max vote node so + \* make sure it is still on one term with us. This is a model + \* simplification which can be removed, in impl we fetch WAL on demand + \* from safekeeper which has it later. Note though that in case of on + \* demand fetch we must check on donor not only term match, but that + \* truncate_wal had already been done (if it is not max_vote_acc). 
+ /\ acc_state[max_vote_acc].term = prop_state[p].term + /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", + ![p].termHistory = prop_th, + ![p].wal = acc_state[max_vote_acc].wal + ] + /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1) + /\ UNCHANGED <> + + +\* Acceptor a learns about elected proposer p's term. In impl it matches to +\* VoteRequest/VoteResponse exchange when leader is already elected and is not +\* interested in the vote result. +UpdateTerm(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term < prop_state[p].term + /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] + /\ UNCHANGED <> + +\* Find highest common point (LSN of the first divergent record) in the logs of +\* proposer p and acceptor a. Returns of the highest common point. +FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) == + LET + \* First find index of the highest common term. + \* It must exist because we initialize th with <0, 1>. + last_common_idx == Maximum({i \in 1..Min(Len(prop_th), Len(acc_th)): prop_th[i].term = acc_th[i].term}) + last_common_term == prop_th[last_common_idx].term + \* Now find where it ends at both prop and acc and take min. End of term + \* is the start of the next unless it is the last one; there it is + \* flush_lsn in case of acceptor. In case of proposer it is the current + \* writing position, but it can't be less than flush_lsn, so we + \* take flush_lsn. + acc_common_term_end == IF last_common_idx = Len(acc_th) THEN acc_flush_lsn ELSE acc_th[last_common_idx + 1].lsn + prop_common_term_end == IF last_common_idx = Len(prop_th) THEN acc_flush_lsn ELSE prop_th[last_common_idx + 1].lsn + IN + [term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)] + +\* Elected proposer p immediately truncates WAL (and term history) of acceptor a +\* before starting streaming. Establishes nextSendLsn for a. +\* +\* In impl this happens at each reconnection, here we also allow to do it multiple times. +TruncateWal(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term = prop_state[p].term + /\ LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + next_send_lsn == (a :> hcp.lsn) @@ prop_state[p].nextSendLsn + IN + \* Acceptor persists full history immediately; reads adjust it to the + \* really existing wal with AcceptorTermHistory. + /\ acc_state' = [acc_state EXCEPT ![a].termHistory = prop_state[p].termHistory, + \* note: SubSeq is inclusive, hence -1. + ![a].wal = SubSeq(acc_state[a].wal, 1, hcp.lsn - 1) + ] + /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn = next_send_lsn] + /\ UNCHANGED <> + +\* Append new log entry to elected proposer +NewEntry(p) == + /\ prop_state[p].state = "leader" + /\ LET + \* entry consists only of term, index serves as LSN. + new_entry == prop_state[p].term + IN + /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)] + /\ UNCHANGED <> + +\* Immediately append next entry from elected proposer to acceptor a. +AppendEntry(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term = prop_state[p].term + /\ a \in DOMAIN prop_state[p].nextSendLsn \* did TruncateWal + /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send + /\ LET + send_lsn == prop_state[p].nextSendLsn[a] + entry == prop_state[p].wal[send_lsn] + \* Since message delivery is instant we don't check that send_lsn follows + \* the last acc record, it must always be true. 
+ IN + /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn[a] = send_lsn + 1] + /\ acc_state' = [acc_state EXCEPT ![a].wal = Append(acc_state[a].wal, entry)] + /\ UNCHANGED <> + +\* LSN where elected proposer p starts writing its records. +PropStartLsn(p) == + IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL + +\* Proposer p commits all entries it can using quorum q. Note that unlike +\* will62794/logless-reconfig this allows to commit entries from previous terms +\* (when conditions for that are met). +CommitEntries(p, q) == + /\ prop_state[p].state = "leader" + /\ \A a \in q: + /\ acc_state[a].term = prop_state[p].term + \* nextSendLsn existence means TruncateWal has happened, it ensures + \* acceptor's WAL (and FlushLsn) are from proper proposer's history. + \* Alternatively we could compare LastLogTerm here, but that's closer to + \* what we do in the impl (we check flushLsn in AppendResponse, but + \* AppendRequest is processed only if HandleElected handling was good). + /\ a \in DOMAIN prop_state[p].nextSendLsn + \* Now find the LSN present on all the quorum. + /\ LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN + \* This is the basic Raft rule of not committing entries from previous + \* terms except along with current term entry (commit them only when + \* quorum recovers, i.e. last_log_term on it reaches leader's term). + /\ quorum_lsn >= PropStartLsn(p) + /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(quorum_lsn - 1)} + /\ UNCHANGED <> + +\******************************************************************************* +\* Final spec +\******************************************************************************* + +Next == + \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) + \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) + \/ \E p \in proposers: BecomeLeader(p) + \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) + \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a) + \/ \E p \in proposers: NewEntry(p) + \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a) + \/ \E q \in Quorums: \E p \in proposers: CommitEntries(p, q) + +Spec == Init /\ [][Next]_<> + + +\******************************************************************************** +\* Invariants +\******************************************************************************** + +\* Lighter version of ElectionSafetyFull which doesn't require elected_history. +ElectionSafety == + \A p1, p2 \in proposers: + (/\ prop_state[p1].state = "leader" + /\ prop_state[p2].state = "leader" + /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2) + +\* Single term must never be elected more than once. +ElectionSafetyFull == \A term \in DOMAIN elected_history: elected_history[term] <= 1 + +\* Log is expected to be monotonic by comparison. This is not true +\* in variants of multi Paxos, but in Raft (and here) it is. +LogIsMonotonic == + \A a \in acceptors: + \A i, j \in DOMAIN acc_state[a].wal: + (i > j) => (acc_state[a].wal[i] >= acc_state[a].wal[j]) + +\* Main invariant: If two entries are committed at the same LSN, they must be +\* the same entry. +LogSafety == + \A c1, c2 \in committed: (c1.lsn = c2.lsn) => (c1 = c2) + + +\******************************************************************************** +\* Invariants which don't need to hold, but useful for playing/debugging. 
+\******************************************************************************** + +\* Limits term of elected proposers +MaxTerm == \A p \in proposers: (prop_state[p].state = "leader" => prop_state[p].term < 2) + +MaxAccWalLen == \A a \in acceptors: Len(acc_state[a].wal) < 2 + +\* Limits max number of committed entries. That way we can check that we'are +\* actually committing something. +MaxCommitLsn == Cardinality(committed) < 2 + +\* How many records with different terms can be removed in single WAL +\* truncation. +MaxTruncatedTerms == + \A p \in proposers: \A a \in acceptors: + (/\ prop_state[p].state = "leader" + /\ prop_state[p].term = acc_state[a].term) => + LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn} + truncated_records_terms == {acc_state[a].wal[lsn]: lsn \in truncated_lsns} + IN + Cardinality(truncated_records_terms) < 2 + +\* Check that TruncateWal never deletes committed record. +\* It might seem that this should an invariant, but it is not. +\* With 5 nodes, it is legit to truncate record which had been +\* globally committed: e.g. nodes abc can commit record of term 1 in +\* term 3, and after that leader of term 2 can delete such record +\* on d. On 10 cores TLC can find such a trace in ~7 hours. +CommittedNotTruncated == + \A p \in proposers: \A a \in acceptors: + (/\ prop_state[p].state = "leader" + /\ prop_state[p].term = acc_state[a].term) => + LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn} + truncated_records == {[term |-> acc_state[a].wal[lsn], lsn |-> lsn]: lsn \in truncated_lsns} + IN + \A r \in truncated_records: r \notin committed + +==== diff --git a/safekeeper/spec/modelcheck.sh b/safekeeper/spec/modelcheck.sh new file mode 100755 index 0000000000..21ead7dad8 --- /dev/null +++ b/safekeeper/spec/modelcheck.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Usage: ./modelcheck.sh , e.g. +# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla +CONFIG=$1 +SPEC=$2 + +MEM=7G +TOOLSPATH="/opt/TLA+Toolbox/tla2tools.jar" + +mkdir -p "tlc-results" +CONFIG_FILE=$(basename -- "$CONFIG") +outfilename="$SPEC-${CONFIG_FILE}-$(date --utc +%Y-%m-%d--%H-%M-%S)".log +outfile="tlc-results/$outfilename" +touch $outfile + +# Save some info about the run. +GIT_REV=`git rev-parse --short HEAD` +INFO=`uname -a` + +# First for Linux, second for Mac. +CPUNAMELinux=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1') +CPUCORESLinux=`nproc` +CPUNAMEMac=`sysctl -n machdep.cpu.brand_string` +CPUCORESMac=`sysctl -n machdep.cpu.thread_count` + +echo "git revision: $GIT_REV" >> $outfile +echo "Platform: $INFO" >> $outfile +echo "CPU Info Linux: $CPUNAMELinux" >> $outfile +echo "CPU Cores Linux: $CPUCORESLinux" >> $outfile +echo "CPU Info Mac: $CPUNAMEMac" >> $outfile +echo "CPU Cores Mac: $CPUCORESMac" >> $outfile +echo "Spec: $SPEC" >> $outfile +echo "Config: $CONFIG" >> $outfile +echo "----" >> $outfile +cat $CONFIG >> $outfile +echo "" >> $outfile +echo "----" >> $outfile +echo "" >> $outfile + +# see +# https://lamport.azurewebsites.net/tla/current-tools.pdf +# for TLC options. +# OffHeapDiskFPSet is the optimal fingerprint set implementation +# https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets +# +# Add -simulate to run in infinite simulation mode. 
+java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \ + -cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg new file mode 100644 index 0000000000..c06109c601 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg @@ -0,0 +1,19 @@ +\* A very small model just to play. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg new file mode 100644 index 0000000000..5d10fa960f --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg @@ -0,0 +1,19 @@ +\* A model next to the smallest one. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg new file mode 100644 index 0000000000..8ba8ce95a4 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg @@ -0,0 +1,17 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg new file mode 100644 index 0000000000..4763a34ec4 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg @@ -0,0 +1,17 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg new file mode 100644 index 0000000000..ebf4724633 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg new file mode 100644 index 0000000000..bb77350c58 --- /dev/null +++ 
b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg new file mode 100644 index 0000000000..9a5e142f99 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 4 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/readme.md b/safekeeper/spec/readme.md new file mode 100644 index 0000000000..ec2649d87d --- /dev/null +++ b/safekeeper/spec/readme.md @@ -0,0 +1,12 @@ +The specifications, models and results of running of them of the compute <-> +safekeepers consensus algorithm for committing WAL on the fleet of safekeepers. +Following Paxos parlance, compute which writes WAL is called (WAL) proposer here +and safekeepers which persist it are called (WAL) acceptors. + +Directory structure: +- Use modelcheck.sh to run TLC. +- MC*.tla contains bits of TLA+ needed for TLC like constraining the state space, and models/ actual models. +- Other .tla files are the actual specs. + +Structure is partially borrowed from +[logless-reconfig](https://github.com/will62794/logless-reconfig), thanks to it. diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log new file mode 100644 index 0000000000..768722b1eb --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log @@ -0,0 +1,63 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg +---- +\* A very small model just to play. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 110 and seed 3949669318051689745 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 46037] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11123278435718411444/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11123278435718411444/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-11123278435718411444/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-11123278435718411444/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-11123278435718411444/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-11123278435718411444/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-11123278435718411444/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 13:44:18) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 13:44:20. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 2.9E-9 + based on the actual fingerprints: val = 4.1E-10 +922134 states generated, 61249 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 31. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 6 and the 95th percentile is 3). +Finished in 11s at (2024-11-06 13:44:28) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log new file mode 100644 index 0000000000..ae3ba98da6 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log @@ -0,0 +1,69 @@ +git revision: bcbff084a +Platform: Linux nonlibrem 6.10.11-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.10.11-1 (2024-09-22) x86_64 GNU/Linux +CPU Info Linux: 13th Gen Intel(R) Core(TM) i7-1355U +CPU Cores Linux: 10 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg +---- +\* A model next to the smallest one. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + + +---- + +TLC2 Version 2.20 of Day Month 20?? 
(rev: cc65eef) +Running breadth-first search Model-Checking with fp 41 and seed -3061068726727581619 with 10 workers on 10 cores with 6372MB heap and 7168MB offheap memory [pid: 1250346] (Linux 6.10.11-amd64 amd64, Debian 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/ars/neon/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3023124431504466774/TLC.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/ars/neon/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3023124431504466774/_TLCTrace.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-3023124431504466774/Integers.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-3023124431504466774/Sequences.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-3023124431504466774/FiniteSets.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-3023124431504466774/Naturals.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-3023124431504466774/TLCExt.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-15 12:09:59) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-15 12:10:00. +Progress(19) at 2024-11-15 12:10:03: 464,696 states generated (464,696 s/min), 57,859 distinct states found (57,859 ds/min), 21,435 states left on queue. +Progress(26) at 2024-11-15 12:11:03: 8,813,399 states generated (8,348,703 s/min), 877,254 distinct states found (819,395 ds/min), 214,794 states left on queue. +Progress(27) at 2024-11-15 12:12:03: 16,121,858 states generated (7,308,459 s/min), 1,464,707 distinct states found (587,453 ds/min), 274,230 states left on queue. +Progress(29) at 2024-11-15 12:13:03: 23,073,903 states generated (6,952,045 s/min), 1,948,802 distinct states found (484,095 ds/min), 263,697 states left on queue. +Progress(31) at 2024-11-15 12:14:03: 29,740,681 states generated (6,666,778 s/min), 2,331,052 distinct states found (382,250 ds/min), 185,484 states left on queue. +Progress(34) at 2024-11-15 12:15:03: 36,085,876 states generated (6,345,195 s/min), 2,602,370 distinct states found (271,318 ds/min), 31,659 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 4.9E-6 + based on the actual fingerprints: val = 6.9E-7 +36896322 states generated, 2623542 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 39. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3). 
+Finished in 05min 14s at (2024-11-15 12:15:13) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log new file mode 100644 index 0000000000..46f21cee72 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log @@ -0,0 +1,72 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 126 and seed 2302892334567572769 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 39701] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-15178810317173795942/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-15178810317173795942/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-15178810317173795942/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-15178810317173795942/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-15178810317173795942/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-15178810317173795942/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-15178810317173795942/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 13:03:52) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 13:03:55. +Progress(21) at 2024-11-06 13:03:58: 846,240 states generated (846,240 s/min), 106,298 distinct states found (106,298 ds/min), 41,028 states left on queue. +Progress(28) at 2024-11-06 13:04:58: 27,538,211 states generated (26,691,971 s/min), 2,768,793 distinct states found (2,662,495 ds/min), 782,984 states left on queue. 
+Progress(30) at 2024-11-06 13:05:58: 54,048,763 states generated (26,510,552 s/min), 5,076,745 distinct states found (2,307,952 ds/min), 1,241,301 states left on queue. +Progress(31) at 2024-11-06 13:06:58: 80,554,724 states generated (26,505,961 s/min), 7,199,201 distinct states found (2,122,456 ds/min), 1,541,574 states left on queue. +Progress(32) at 2024-11-06 13:07:58: 106,991,261 states generated (26,436,537 s/min), 9,121,549 distinct states found (1,922,348 ds/min), 1,686,289 states left on queue. +Progress(33) at 2024-11-06 13:08:58: 133,354,665 states generated (26,363,404 s/min), 10,935,451 distinct states found (1,813,902 ds/min), 1,739,977 states left on queue. +Progress(34) at 2024-11-06 13:09:58: 159,631,385 states generated (26,276,720 s/min), 12,605,372 distinct states found (1,669,921 ds/min), 1,677,447 states left on queue. +Progress(35) at 2024-11-06 13:10:58: 185,862,196 states generated (26,230,811 s/min), 14,138,409 distinct states found (1,533,037 ds/min), 1,501,760 states left on queue. +Progress(36) at 2024-11-06 13:11:58: 212,021,688 states generated (26,159,492 s/min), 15,538,990 distinct states found (1,400,581 ds/min), 1,216,621 states left on queue. +Progress(37) at 2024-11-06 13:12:58: 238,046,160 states generated (26,024,472 s/min), 16,778,583 distinct states found (1,239,593 ds/min), 797,230 states left on queue. +Progress(39) at 2024-11-06 13:13:58: 263,931,163 states generated (25,885,003 s/min), 17,820,786 distinct states found (1,042,203 ds/min), 209,400 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 2.5E-4 + based on the actual fingerprints: val = 7.9E-5 +270257170 states generated, 18005639 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 47. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3). +Finished in 10min 25s at (2024-11-06 13:14:17) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log new file mode 100644 index 0000000000..c7cc853af0 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log @@ -0,0 +1,1466 @@ +# Shows LogSafety violation when "don't commit separately entries from previous terms" check is disabled. +git revision: 4f1ee6331 +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? 
(rev: f68cb71) +Running breadth-first search Model-Checking with fp 12 and seed -5379034126224420237 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 52295] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-4533438058229992850/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-4533438058229992850/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-4533438058229992850/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-4533438058229992850/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-4533438058229992850/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-4533438058229992850/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-4533438058229992850/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 14:20:26) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 14:20:29. +Progress(20) at 2024-11-06 14:20:32: 1,011,898 states generated (1,011,898 s/min), 140,947 distinct states found (140,947 ds/min), 60,535 states left on queue. +Progress(26) at 2024-11-06 14:21:32: 30,146,518 states generated (29,134,620 s/min), 3,742,736 distinct states found (3,601,789 ds/min), 1,438,779 states left on queue. +Progress(27) at 2024-11-06 14:22:32: 59,362,708 states generated (29,216,190 s/min), 7,210,233 distinct states found (3,467,497 ds/min), 2,708,295 states left on queue. +Progress(28) at 2024-11-06 14:23:32: 88,589,291 states generated (29,226,583 s/min), 10,552,781 distinct states found (3,342,548 ds/min), 3,874,296 states left on queue. +Progress(29) at 2024-11-06 14:24:32: 117,894,209 states generated (29,304,918 s/min), 13,932,498 distinct states found (3,379,717 ds/min), 5,069,960 states left on queue. +Progress(29) at 2024-11-06 14:25:32: 147,338,882 states generated (29,444,673 s/min), 17,180,069 distinct states found (3,247,571 ds/min), 6,146,371 states left on queue. +Progress(29) at 2024-11-06 14:26:32: 176,498,135 states generated (29,159,253 s/min), 20,547,926 distinct states found (3,367,857 ds/min), 7,338,835 states left on queue. +Progress(30) at 2024-11-06 14:27:32: 205,957,044 states generated (29,458,909 s/min), 23,661,090 distinct states found (3,113,164 ds/min), 8,293,570 states left on queue. +Progress(30) at 2024-11-06 14:28:32: 235,390,133 states generated (29,433,089 s/min), 26,892,306 distinct states found (3,231,216 ds/min), 9,369,229 states left on queue. 
+Progress(30) at 2024-11-06 14:29:32: 264,571,938 states generated (29,181,805 s/min), 30,176,971 distinct states found (3,284,665 ds/min), 10,493,429 states left on queue. +Progress(31) at 2024-11-06 14:30:32: 293,928,191 states generated (29,356,253 s/min), 33,296,160 distinct states found (3,119,189 ds/min), 11,463,686 states left on queue. +Progress(31) at 2024-11-06 14:31:32: 323,436,668 states generated (29,508,477 s/min), 36,347,973 distinct states found (3,051,813 ds/min), 12,365,578 states left on queue. +Progress(31) at 2024-11-06 14:32:32: 352,943,790 states generated (29,507,122 s/min), 39,465,244 distinct states found (3,117,271 ds/min), 13,349,544 states left on queue. +Progress(31) at 2024-11-06 14:33:32: 382,292,863 states generated (29,349,073 s/min), 42,654,621 distinct states found (3,189,377 ds/min), 14,384,363 states left on queue. +Progress(31) at 2024-11-06 14:34:32: 411,385,854 states generated (29,092,991 s/min), 45,941,145 distinct states found (3,286,524 ds/min), 15,509,450 states left on queue. +Progress(31) at 2024-11-06 14:35:32: 440,738,756 states generated (29,352,902 s/min), 48,984,566 distinct states found (3,043,421 ds/min), 16,419,882 states left on queue. +Progress(32) at 2024-11-06 14:36:32: 470,251,558 states generated (29,512,802 s/min), 51,925,693 distinct states found (2,941,127 ds/min), 17,211,457 states left on queue. +Progress(32) at 2024-11-06 14:37:32: 499,714,013 states generated (29,462,455 s/min), 54,955,581 distinct states found (3,029,888 ds/min), 18,114,624 states left on queue. +Progress(32) at 2024-11-06 14:38:32: 529,254,608 states generated (29,540,595 s/min), 57,938,914 distinct states found (2,983,333 ds/min), 18,996,128 states left on queue. +Progress(32) at 2024-11-06 14:39:32: 558,774,398 states generated (29,519,790 s/min), 61,072,943 distinct states found (3,134,029 ds/min), 19,975,689 states left on queue. +Progress(32) at 2024-11-06 14:40:32: 588,134,665 states generated (29,360,267 s/min), 64,148,888 distinct states found (3,075,945 ds/min), 20,922,407 states left on queue. +Progress(32) at 2024-11-06 14:41:32: 617,464,374 states generated (29,329,709 s/min), 67,306,855 distinct states found (3,157,967 ds/min), 21,928,799 states left on queue. +Progress(32) at 2024-11-06 14:42:32: 646,525,281 states generated (29,060,907 s/min), 70,425,194 distinct states found (3,118,339 ds/min), 22,895,971 states left on queue. +Progress(32) at 2024-11-06 14:43:32: 676,054,893 states generated (29,529,612 s/min), 73,351,905 distinct states found (2,926,711 ds/min), 23,703,779 states left on queue. +Progress(33) at 2024-11-06 14:44:32: 705,581,782 states generated (29,526,889 s/min), 76,200,615 distinct states found (2,848,710 ds/min), 24,414,094 states left on queue. +Progress(33) at 2024-11-06 14:45:32: 735,069,836 states generated (29,488,054 s/min), 79,168,244 distinct states found (2,967,629 ds/min), 25,255,224 states left on queue. +Progress(33) at 2024-11-06 14:46:32: 764,659,188 states generated (29,589,352 s/min), 82,024,430 distinct states found (2,856,186 ds/min), 26,011,047 states left on queue. +Progress(33) at 2024-11-06 14:47:32: 794,276,423 states generated (29,617,235 s/min), 84,974,312 distinct states found (2,949,882 ds/min), 26,868,750 states left on queue. +Progress(33) at 2024-11-06 14:48:32: 823,875,831 states generated (29,599,408 s/min), 88,004,386 distinct states found (3,030,074 ds/min), 27,771,984 states left on queue. 
+Progress(33) at 2024-11-06 14:49:32: 853,138,894 states generated (29,263,063 s/min), 91,006,890 distinct states found (3,002,504 ds/min), 28,636,661 states left on queue. +Checkpointing of run states/24-11-06-14-20-25.868 +Checkpointing completed at (2024-11-06 14:50:32) +Progress(33) at 2024-11-06 14:50:32: 882,514,167 states generated (29,375,273 s/min), 94,011,000 distinct states found (3,004,110 ds/min), 29,534,516 states left on queue. +Progress(33) at 2024-11-06 14:51:32: 911,838,377 states generated (29,324,210 s/min), 97,108,937 distinct states found (3,097,937 ds/min), 30,498,587 states left on queue. +Progress(33) at 2024-11-06 14:52:32: 940,646,920 states generated (28,808,543 s/min), 100,248,865 distinct states found (3,139,928 ds/min), 31,472,191 states left on queue. +Progress(33) at 2024-11-06 14:53:32: 970,074,175 states generated (29,427,255 s/min), 103,170,815 distinct states found (2,921,950 ds/min), 32,265,691 states left on queue. +Progress(33) at 2024-11-06 14:54:32: 999,627,974 states generated (29,553,799 s/min), 106,004,823 distinct states found (2,834,008 ds/min), 33,009,618 states left on queue. +Progress(34) at 2024-11-06 14:55:32: 1,029,148,983 states generated (29,521,009 s/min), 108,740,783 distinct states found (2,735,960 ds/min), 33,616,222 states left on queue. +Progress(34) at 2024-11-06 14:56:32: 1,058,582,001 states generated (29,433,018 s/min), 111,612,965 distinct states found (2,872,182 ds/min), 34,375,212 states left on queue. +Progress(34) at 2024-11-06 14:57:32: 1,088,123,602 states generated (29,541,601 s/min), 114,464,196 distinct states found (2,851,231 ds/min), 35,116,195 states left on queue. +Progress(34) at 2024-11-06 14:58:32: 1,117,684,936 states generated (29,561,334 s/min), 117,252,198 distinct states found (2,788,002 ds/min), 35,817,205 states left on queue. +Progress(34) at 2024-11-06 14:59:32: 1,147,356,249 states generated (29,671,313 s/min), 120,014,476 distinct states found (2,762,278 ds/min), 36,517,255 states left on queue. +Progress(34) at 2024-11-06 15:00:32: 1,176,921,098 states generated (29,564,849 s/min), 122,859,312 distinct states found (2,844,836 ds/min), 37,291,096 states left on queue. +Progress(34) at 2024-11-06 15:01:32: 1,206,454,440 states generated (29,533,342 s/min), 125,830,942 distinct states found (2,971,630 ds/min), 38,147,762 states left on queue. +Progress(34) at 2024-11-06 15:02:32: 1,235,721,673 states generated (29,267,233 s/min), 128,869,493 distinct states found (3,038,551 ds/min), 39,035,481 states left on queue. +Progress(34) at 2024-11-06 15:03:32: 1,265,097,779 states generated (29,376,106 s/min), 131,669,552 distinct states found (2,800,059 ds/min), 39,746,864 states left on queue. +Progress(34) at 2024-11-06 15:04:32: 1,294,408,098 states generated (29,310,319 s/min), 134,604,630 distinct states found (2,935,078 ds/min), 40,584,235 states left on queue. +Progress(34) at 2024-11-06 15:05:32: 1,323,792,755 states generated (29,384,657 s/min), 137,579,390 distinct states found (2,974,760 ds/min), 41,446,478 states left on queue. +Progress(34) at 2024-11-06 15:06:32: 1,353,085,163 states generated (29,292,408 s/min), 140,575,723 distinct states found (2,996,333 ds/min), 42,309,510 states left on queue. +Progress(34) at 2024-11-06 15:07:32: 1,381,809,417 states generated (28,724,254 s/min), 143,655,566 distinct states found (3,079,843 ds/min), 43,220,682 states left on queue. 
+Progress(34) at 2024-11-06 15:08:32: 1,411,255,848 states generated (29,446,431 s/min), 146,482,192 distinct states found (2,826,626 ds/min), 43,944,938 states left on queue. +Progress(34) at 2024-11-06 15:09:32: 1,440,646,323 states generated (29,390,475 s/min), 149,419,989 distinct states found (2,937,797 ds/min), 44,763,293 states left on queue. +Progress(34) at 2024-11-06 15:10:32: 1,470,298,568 states generated (29,652,245 s/min), 152,041,419 distinct states found (2,621,430 ds/min), 45,311,911 states left on queue. +Progress(35) at 2024-11-06 15:11:32: 1,499,747,712 states generated (29,449,144 s/min), 154,696,867 distinct states found (2,655,448 ds/min), 45,842,895 states left on queue. +Progress(35) at 2024-11-06 15:12:32: 1,529,256,993 states generated (29,509,281 s/min), 157,493,365 distinct states found (2,796,498 ds/min), 46,535,472 states left on queue. +Progress(35) at 2024-11-06 15:13:32: 1,558,829,306 states generated (29,572,313 s/min), 160,256,575 distinct states found (2,763,210 ds/min), 47,212,471 states left on queue. +Progress(35) at 2024-11-06 15:14:32: 1,588,345,878 states generated (29,516,572 s/min), 163,002,602 distinct states found (2,746,027 ds/min), 47,862,117 states left on queue. +Progress(35) at 2024-11-06 15:15:32: 1,617,885,675 states generated (29,539,797 s/min), 165,699,121 distinct states found (2,696,519 ds/min), 48,472,896 states left on queue. +Progress(35) at 2024-11-06 15:16:32: 1,647,559,965 states generated (29,674,290 s/min), 168,343,286 distinct states found (2,644,165 ds/min), 49,065,377 states left on queue. +Progress(35) at 2024-11-06 15:17:32: 1,677,033,250 states generated (29,473,285 s/min), 171,134,409 distinct states found (2,791,123 ds/min), 49,823,330 states left on queue. +Progress(35) at 2024-11-06 15:18:32: 1,706,730,266 states generated (29,697,016 s/min), 173,860,974 distinct states found (2,726,565 ds/min), 50,493,221 states left on queue. +Error: Invariant LogSafety is violated. 
+Error: The behavior up to this point is: +State 1: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 2: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 3: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 4: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 5: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 6: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> 
"leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 7: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 8: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 9: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 
1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 10: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + nextSendLsn |-> (a1 :> 1) ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 11: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 12: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 13: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) 
+/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 14: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 15: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 16: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 3) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 17: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state 
= ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 18: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 19: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 20: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 21: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, 
lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 22: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 1) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {} + +State 23: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {} + +State 24: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + 
nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 25: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 26: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 27: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + 
nextSendLsn |-> (a3 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 28: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a3 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 29: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 
4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 30: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 1 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 31: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 32: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, 
+ flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1], [term |-> 4, lsn |-> 1]} + +1712918117 states generated, 174460942 distinct states found, 50658619 states left on queue. +The depth of the complete state graph search is 35. +Finished in 58min 19s at (2024-11-06 15:18:45) +Trace exploration spec path: ./MCProposerAcceptorStatic_TTrace_1730902825.tla diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log new file mode 100644 index 0000000000..8248240ded --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log @@ -0,0 +1,1374 @@ +git revision: 4f1ee6331 +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 84 and seed -1069171980999686913 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 62544] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-6542850091824737097/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-6542850091824737097/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-6542850091824737097/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-6542850091824737097/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-6542850091824737097/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-6542850091824737097/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-6542850091824737097/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 15:30:45) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 15:30:48. +Progress(20) at 2024-11-06 15:30:51: 956,386 states generated (956,386 s/min), 134,121 distinct states found (134,121 ds/min), 57,996 states left on queue. +Progress(27) at 2024-11-06 15:31:51: 30,048,294 states generated (29,091,908 s/min), 3,778,849 distinct states found (3,644,728 ds/min), 1,463,715 states left on queue. +Progress(28) at 2024-11-06 15:32:51: 59,092,248 states generated (29,043,954 s/min), 7,282,332 distinct states found (3,503,483 ds/min), 2,750,944 states left on queue. +Progress(29) at 2024-11-06 15:33:51: 88,333,136 states generated (29,240,888 s/min), 10,694,325 distinct states found (3,411,993 ds/min), 3,955,744 states left on queue. +Progress(29) at 2024-11-06 15:34:51: 117,708,994 states generated (29,375,858 s/min), 14,000,885 distinct states found (3,306,560 ds/min), 5,067,487 states left on queue. +Progress(30) at 2024-11-06 15:35:51: 146,847,667 states generated (29,138,673 s/min), 17,407,824 distinct states found (3,406,939 ds/min), 6,258,337 states left on queue. +Progress(30) at 2024-11-06 15:36:51: 176,211,801 states generated (29,364,134 s/min), 20,626,933 distinct states found (3,219,109 ds/min), 7,302,661 states left on queue. +Progress(31) at 2024-11-06 15:37:51: 205,665,438 states generated (29,453,637 s/min), 23,877,622 distinct states found (3,250,689 ds/min), 8,361,004 states left on queue. +Progress(31) at 2024-11-06 15:38:51: 234,757,357 states generated (29,091,919 s/min), 27,246,813 distinct states found (3,369,191 ds/min), 9,511,916 states left on queue. +Progress(31) at 2024-11-06 15:39:51: 264,154,436 states generated (29,397,079 s/min), 30,383,069 distinct states found (3,136,256 ds/min), 10,494,238 states left on queue. +Progress(31) at 2024-11-06 15:40:51: 293,638,121 states generated (29,483,685 s/min), 33,498,433 distinct states found (3,115,364 ds/min), 11,429,812 states left on queue. 
+Progress(32) at 2024-11-06 15:41:51: 323,039,991 states generated (29,401,870 s/min), 36,709,338 distinct states found (3,210,905 ds/min), 12,463,752 states left on queue. +Progress(32) at 2024-11-06 15:42:51: 352,081,458 states generated (29,041,467 s/min), 39,979,938 distinct states found (3,270,600 ds/min), 13,531,461 states left on queue. +Progress(32) at 2024-11-06 15:43:51: 381,472,323 states generated (29,390,865 s/min), 43,147,359 distinct states found (3,167,421 ds/min), 14,513,444 states left on queue. +Progress(32) at 2024-11-06 15:44:51: 410,911,764 states generated (29,439,441 s/min), 46,200,793 distinct states found (3,053,434 ds/min), 15,418,951 states left on queue. +Progress(32) at 2024-11-06 15:45:51: 440,514,627 states generated (29,602,863 s/min), 49,210,279 distinct states found (3,009,486 ds/min), 16,263,879 states left on queue. +Progress(33) at 2024-11-06 15:46:51: 470,070,180 states generated (29,555,553 s/min), 52,317,535 distinct states found (3,107,256 ds/min), 17,200,875 states left on queue. +Progress(33) at 2024-11-06 15:47:51: 499,387,268 states generated (29,317,088 s/min), 55,489,376 distinct states found (3,171,841 ds/min), 18,196,719 states left on queue. +Progress(33) at 2024-11-06 15:48:51: 528,308,354 states generated (28,921,086 s/min), 58,716,400 distinct states found (3,227,024 ds/min), 19,225,822 states left on queue. +Progress(33) at 2024-11-06 15:49:51: 557,626,508 states generated (29,318,154 s/min), 61,861,039 distinct states found (3,144,639 ds/min), 20,172,391 states left on queue. +Progress(33) at 2024-11-06 15:50:51: 587,011,551 states generated (29,385,043 s/min), 64,911,520 distinct states found (3,050,481 ds/min), 21,068,246 states left on queue. +Progress(33) at 2024-11-06 15:51:51: 616,469,665 states generated (29,458,114 s/min), 67,862,377 distinct states found (2,950,857 ds/min), 21,888,495 states left on queue. +Progress(33) at 2024-11-06 15:52:51: 646,037,901 states generated (29,568,236 s/min), 70,774,601 distinct states found (2,912,224 ds/min), 22,642,487 states left on queue. +Progress(33) at 2024-11-06 15:53:51: 675,679,292 states generated (29,641,391 s/min), 73,753,124 distinct states found (2,978,523 ds/min), 23,459,982 states left on queue. +Progress(34) at 2024-11-06 15:54:51: 705,213,119 states generated (29,533,827 s/min), 76,751,356 distinct states found (2,998,232 ds/min), 24,319,315 states left on queue. +Progress(34) at 2024-11-06 15:55:51: 734,548,637 states generated (29,335,518 s/min), 79,865,504 distinct states found (3,114,148 ds/min), 25,270,867 states left on queue. +Progress(34) at 2024-11-06 15:56:51: 763,724,351 states generated (29,175,714 s/min), 82,969,406 distinct states found (3,103,902 ds/min), 26,203,099 states left on queue. +Progress(34) at 2024-11-06 15:57:51: 792,795,916 states generated (29,071,565 s/min), 86,092,913 distinct states found (3,123,507 ds/min), 27,124,641 states left on queue. +Progress(34) at 2024-11-06 15:58:51: 822,084,221 states generated (29,288,305 s/min), 89,196,548 distinct states found (3,103,635 ds/min), 28,028,058 states left on queue. +Progress(34) at 2024-11-06 15:59:51: 851,516,510 states generated (29,432,289 s/min), 92,135,078 distinct states found (2,938,530 ds/min), 28,822,750 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 16:00:51) +Progress(34) at 2024-11-06 16:00:51: 880,891,436 states generated (29,374,926 s/min), 95,133,622 distinct states found (2,998,544 ds/min), 29,669,470 states left on queue. +Progress(34) at 2024-11-06 16:01:51: 910,262,536 states generated (29,371,100 s/min), 98,019,631 distinct states found (2,886,009 ds/min), 30,433,293 states left on queue. +Progress(34) at 2024-11-06 16:02:51: 939,689,255 states generated (29,426,719 s/min), 100,814,884 distinct states found (2,795,253 ds/min), 31,083,132 states left on queue. +Progress(34) at 2024-11-06 16:03:51: 969,299,651 states generated (29,610,396 s/min), 103,664,772 distinct states found (2,849,888 ds/min), 31,821,093 states left on queue. +Progress(34) at 2024-11-06 16:04:51: 999,051,292 states generated (29,751,641 s/min), 106,544,287 distinct states found (2,879,515 ds/min), 32,536,946 states left on queue. +Progress(35) at 2024-11-06 16:05:51: 1,028,690,576 states generated (29,639,284 s/min), 109,444,362 distinct states found (2,900,075 ds/min), 33,326,316 states left on queue. +Progress(35) at 2024-11-06 16:06:51: 1,058,155,400 states generated (29,464,824 s/min), 112,439,937 distinct states found (2,995,575 ds/min), 34,167,604 states left on queue. +Progress(35) at 2024-11-06 16:07:51: 1,087,496,744 states generated (29,341,344 s/min), 115,461,649 distinct states found (3,021,712 ds/min), 35,032,974 states left on queue. +Progress(35) at 2024-11-06 16:08:51: 1,116,663,767 states generated (29,167,023 s/min), 118,482,838 distinct states found (3,021,189 ds/min), 35,902,651 states left on queue. +Progress(35) at 2024-11-06 16:09:51: 1,145,439,918 states generated (28,776,151 s/min), 121,562,159 distinct states found (3,079,321 ds/min), 36,785,088 states left on queue. +Progress(35) at 2024-11-06 16:10:51: 1,174,812,354 states generated (29,372,436 s/min), 124,511,721 distinct states found (2,949,562 ds/min), 37,555,204 states left on queue. +Progress(35) at 2024-11-06 16:11:51: 1,204,150,178 states generated (29,337,824 s/min), 127,579,155 distinct states found (3,067,434 ds/min), 38,425,790 states left on queue. +Progress(35) at 2024-11-06 16:12:51: 1,233,620,353 states generated (29,470,175 s/min), 130,490,427 distinct states found (2,911,272 ds/min), 39,188,412 states left on queue. +Progress(35) at 2024-11-06 16:13:51: 1,263,022,331 states generated (29,401,978 s/min), 133,317,160 distinct states found (2,826,733 ds/min), 39,893,070 states left on queue. +Progress(35) at 2024-11-06 16:14:51: 1,292,411,979 states generated (29,389,648 s/min), 136,229,817 distinct states found (2,912,657 ds/min), 40,666,029 states left on queue. +Progress(35) at 2024-11-06 16:15:51: 1,321,695,856 states generated (29,283,877 s/min), 139,081,910 distinct states found (2,852,093 ds/min), 41,389,715 states left on queue. +Progress(35) at 2024-11-06 16:16:51: 1,351,045,560 states generated (29,349,704 s/min), 141,811,662 distinct states found (2,729,752 ds/min), 41,999,267 states left on queue. +Progress(35) at 2024-11-06 16:17:51: 1,380,677,436 states generated (29,631,876 s/min), 144,516,072 distinct states found (2,704,410 ds/min), 42,579,779 states left on queue. +Progress(35) at 2024-11-06 16:18:51: 1,410,332,660 states generated (29,655,224 s/min), 147,269,848 distinct states found (2,753,776 ds/min), 43,232,732 states left on queue. 
+Progress(35) at 2024-11-06 16:19:51: 1,440,071,594 states generated (29,738,934 s/min), 150,116,683 distinct states found (2,846,835 ds/min), 43,917,859 states left on queue. +Progress(35) at 2024-11-06 16:20:51: 1,469,737,942 states generated (29,666,348 s/min), 152,881,605 distinct states found (2,764,922 ds/min), 44,594,909 states left on queue. +Progress(36) at 2024-11-06 16:21:51: 1,499,124,482 states generated (29,386,540 s/min), 155,722,313 distinct states found (2,840,708 ds/min), 45,306,186 states left on queue. +Progress(36) at 2024-11-06 16:22:51: 1,528,616,635 states generated (29,492,153 s/min), 158,643,911 distinct states found (2,921,598 ds/min), 46,098,600 states left on queue. +Progress(36) at 2024-11-06 16:23:51: 1,557,820,328 states generated (29,203,693 s/min), 161,651,516 distinct states found (3,007,605 ds/min), 46,958,572 states left on queue. +Progress(36) at 2024-11-06 16:24:51: 1,587,341,565 states generated (29,521,237 s/min), 164,469,424 distinct states found (2,817,908 ds/min), 47,648,932 states left on queue. +Progress(36) at 2024-11-06 16:25:51: 1,616,246,807 states generated (28,905,242 s/min), 167,471,199 distinct states found (3,001,775 ds/min), 48,496,844 states left on queue. +Progress(36) at 2024-11-06 16:26:51: 1,645,107,613 states generated (28,860,806 s/min), 170,454,103 distinct states found (2,982,904 ds/min), 49,283,244 states left on queue. +Progress(36) at 2024-11-06 16:27:51: 1,674,492,314 states generated (29,384,701 s/min), 173,343,045 distinct states found (2,888,942 ds/min), 50,006,895 states left on queue. +Progress(36) at 2024-11-06 16:28:51: 1,703,875,027 states generated (29,382,713 s/min), 176,157,623 distinct states found (2,814,578 ds/min), 50,662,128 states left on queue. +Progress(36) at 2024-11-06 16:29:51: 1,733,099,131 states generated (29,224,104 s/min), 179,186,519 distinct states found (3,028,896 ds/min), 51,498,029 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 16:30:52) +Progress(36) at 2024-11-06 16:30:52: 1,762,724,622 states generated (29,625,491 s/min), 181,958,595 distinct states found (2,772,076 ds/min), 52,142,450 states left on queue. +Progress(36) at 2024-11-06 16:31:52: 1,792,118,288 states generated (29,393,666 s/min), 184,725,090 distinct states found (2,766,495 ds/min), 52,785,705 states left on queue. +Progress(36) at 2024-11-06 16:32:52: 1,821,258,069 states generated (29,139,781 s/min), 187,681,452 distinct states found (2,956,362 ds/min), 53,592,610 states left on queue. +Progress(36) at 2024-11-06 16:33:52: 1,850,729,054 states generated (29,470,985 s/min), 190,451,722 distinct states found (2,770,270 ds/min), 54,239,919 states left on queue. +Progress(36) at 2024-11-06 16:34:52: 1,879,860,913 states generated (29,131,859 s/min), 193,207,770 distinct states found (2,756,048 ds/min), 54,886,748 states left on queue. +Progress(36) at 2024-11-06 16:35:52: 1,909,200,565 states generated (29,339,652 s/min), 195,832,123 distinct states found (2,624,353 ds/min), 55,404,535 states left on queue. +Progress(36) at 2024-11-06 16:36:52: 1,938,403,873 states generated (29,203,308 s/min), 198,569,916 distinct states found (2,737,793 ds/min), 55,993,675 states left on queue. +Progress(36) at 2024-11-06 16:37:52: 1,968,097,695 states generated (29,693,822 s/min), 201,148,799 distinct states found (2,578,883 ds/min), 56,501,179 states left on queue. 
+Progress(36) at 2024-11-06 16:38:52: 1,997,628,304 states generated (29,530,609 s/min), 203,860,765 distinct states found (2,711,966 ds/min), 57,133,283 states left on queue. +Progress(36) at 2024-11-06 16:39:52: 2,027,338,755 states generated (29,710,451 s/min), 206,496,491 distinct states found (2,635,726 ds/min), 57,649,914 states left on queue. +Progress(36) at 2024-11-06 16:40:52: 2,057,072,538 states generated (29,733,783 s/min), 209,189,488 distinct states found (2,692,997 ds/min), 58,229,449 states left on queue. +Progress(36) at 2024-11-06 16:41:52: 2,086,549,250 states generated (29,476,712 s/min), 211,909,869 distinct states found (2,720,381 ds/min), 58,875,611 states left on queue. +Progress(37) at 2024-11-06 16:42:52: 2,115,953,926 states generated (29,404,676 s/min), 214,630,876 distinct states found (2,721,007 ds/min), 59,494,220 states left on queue. +Progress(37) at 2024-11-06 16:43:52: 2,145,423,196 states generated (29,469,270 s/min), 217,412,888 distinct states found (2,782,012 ds/min), 60,176,423 states left on queue. +Progress(37) at 2024-11-06 16:44:52: 2,174,796,796 states generated (29,373,600 s/min), 220,316,140 distinct states found (2,903,252 ds/min), 60,925,815 states left on queue. +Progress(37) at 2024-11-06 16:45:52: 2,203,907,384 states generated (29,110,588 s/min), 223,255,125 distinct states found (2,938,985 ds/min), 61,739,564 states left on queue. +Progress(37) at 2024-11-06 16:46:52: 2,233,378,272 states generated (29,470,888 s/min), 225,995,858 distinct states found (2,740,733 ds/min), 62,364,627 states left on queue. +Progress(37) at 2024-11-06 16:47:52: 2,262,648,334 states generated (29,270,062 s/min), 228,738,653 distinct states found (2,742,795 ds/min), 63,003,155 states left on queue. +Progress(37) at 2024-11-06 16:48:52: 2,291,309,648 states generated (28,661,314 s/min), 231,720,498 distinct states found (2,981,845 ds/min), 63,816,162 states left on queue. +Progress(37) at 2024-11-06 16:49:52: 2,320,153,384 states generated (28,843,736 s/min), 234,599,475 distinct states found (2,878,977 ds/min), 64,513,886 states left on queue. +Progress(37) at 2024-11-06 16:50:52: 2,349,538,907 states generated (29,385,523 s/min), 237,330,640 distinct states found (2,731,165 ds/min), 65,105,576 states left on queue. +Progress(37) at 2024-11-06 16:51:52: 2,379,015,082 states generated (29,476,175 s/min), 240,064,625 distinct states found (2,733,985 ds/min), 65,704,108 states left on queue. +Progress(37) at 2024-11-06 16:52:52: 2,408,376,582 states generated (29,361,500 s/min), 242,869,889 distinct states found (2,805,264 ds/min), 66,339,299 states left on queue. +Progress(37) at 2024-11-06 16:53:52: 2,437,554,516 states generated (29,177,934 s/min), 245,844,106 distinct states found (2,974,217 ds/min), 67,125,834 states left on queue. +Progress(37) at 2024-11-06 16:54:52: 2,466,925,193 states generated (29,370,677 s/min), 248,540,587 distinct states found (2,696,481 ds/min), 67,707,623 states left on queue. +Progress(37) at 2024-11-06 16:55:52: 2,496,386,977 states generated (29,461,784 s/min), 251,318,893 distinct states found (2,778,306 ds/min), 68,345,796 states left on queue. +Progress(37) at 2024-11-06 16:56:52: 2,525,837,965 states generated (29,450,988 s/min), 253,918,986 distinct states found (2,600,093 ds/min), 68,851,521 states left on queue. +Progress(37) at 2024-11-06 16:57:52: 2,555,073,687 states generated (29,235,722 s/min), 256,806,753 distinct states found (2,887,767 ds/min), 69,596,597 states left on queue. 
+Progress(37) at 2024-11-06 16:58:52: 2,584,381,294 states generated (29,307,607 s/min), 259,714,054 distinct states found (2,907,301 ds/min), 70,335,539 states left on queue. +Progress(37) at 2024-11-06 16:59:52: 2,613,557,081 states generated (29,175,787 s/min), 262,407,462 distinct states found (2,693,408 ds/min), 70,920,265 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 17:00:53) +Progress(37) at 2024-11-06 17:00:53: 2,643,168,141 states generated (29,611,060 s/min), 264,973,171 distinct states found (2,565,709 ds/min), 71,384,749 states left on queue. +Progress(37) at 2024-11-06 17:01:53: 2,672,453,868 states generated (29,285,727 s/min), 267,551,971 distinct states found (2,578,800 ds/min), 71,854,220 states left on queue. +Progress(37) at 2024-11-06 17:02:53: 2,701,696,399 states generated (29,242,531 s/min), 270,233,135 distinct states found (2,681,164 ds/min), 72,406,567 states left on queue. +Progress(37) at 2024-11-06 17:03:53: 2,731,216,488 states generated (29,520,089 s/min), 272,711,390 distinct states found (2,478,255 ds/min), 72,805,269 states left on queue. +Progress(37) at 2024-11-06 17:04:53: 2,760,788,758 states generated (29,572,270 s/min), 275,307,217 distinct states found (2,595,827 ds/min), 73,313,123 states left on queue. +Progress(37) at 2024-11-06 17:05:53: 2,790,339,552 states generated (29,550,794 s/min), 277,881,113 distinct states found (2,573,896 ds/min), 73,833,900 states left on queue. +Progress(37) at 2024-11-06 17:06:53: 2,820,046,206 states generated (29,706,654 s/min), 280,371,086 distinct states found (2,489,973 ds/min), 74,231,258 states left on queue. +Progress(37) at 2024-11-06 17:07:53: 2,849,787,753 states generated (29,741,547 s/min), 283,097,131 distinct states found (2,726,045 ds/min), 74,814,735 states left on queue. +Progress(37) at 2024-11-06 17:08:53: 2,879,520,949 states generated (29,733,196 s/min), 285,608,053 distinct states found (2,510,922 ds/min), 75,293,894 states left on queue. +Progress(37) at 2024-11-06 17:09:53: 2,908,889,760 states generated (29,368,811 s/min), 288,274,872 distinct states found (2,666,819 ds/min), 75,880,480 states left on queue. +Progress(38) at 2024-11-06 17:10:53: 2,938,412,523 states generated (29,522,763 s/min), 290,877,598 distinct states found (2,602,726 ds/min), 76,391,156 states left on queue. +Progress(38) at 2024-11-06 17:11:53: 2,967,963,455 states generated (29,550,932 s/min), 293,492,146 distinct states found (2,614,548 ds/min), 76,932,124 states left on queue. +Progress(38) at 2024-11-06 17:12:53: 2,997,327,370 states generated (29,363,915 s/min), 296,353,306 distinct states found (2,861,160 ds/min), 77,659,606 states left on queue. +Progress(38) at 2024-11-06 17:13:53: 3,026,713,138 states generated (29,385,768 s/min), 299,173,963 distinct states found (2,820,657 ds/min), 78,342,645 states left on queue. +Progress(38) at 2024-11-06 17:14:53: 3,055,986,492 states generated (29,273,354 s/min), 302,024,049 distinct states found (2,850,086 ds/min), 79,071,501 states left on queue. +Progress(38) at 2024-11-06 17:15:53: 3,085,491,974 states generated (29,505,482 s/min), 304,668,970 distinct states found (2,644,921 ds/min), 79,608,084 states left on queue. +Progress(38) at 2024-11-06 17:16:53: 3,114,898,266 states generated (29,406,292 s/min), 307,272,526 distinct states found (2,603,556 ds/min), 80,132,575 states left on queue. 
+Progress(38) at 2024-11-06 17:17:53: 3,144,023,490 states generated (29,125,224 s/min), 310,022,073 distinct states found (2,749,547 ds/min), 80,777,238 states left on queue. +Progress(38) at 2024-11-06 17:18:53: 3,172,762,795 states generated (28,739,305 s/min), 312,891,905 distinct states found (2,869,832 ds/min), 81,497,739 states left on queue. +Progress(38) at 2024-11-06 17:19:53: 3,201,314,425 states generated (28,551,630 s/min), 315,766,566 distinct states found (2,874,661 ds/min), 82,171,729 states left on queue. +Progress(38) at 2024-11-06 17:20:53: 3,230,713,777 states generated (29,399,352 s/min), 318,365,612 distinct states found (2,599,046 ds/min), 82,638,018 states left on queue. +Progress(38) at 2024-11-06 17:21:53: 3,260,188,634 states generated (29,474,857 s/min), 321,040,810 distinct states found (2,675,198 ds/min), 83,185,708 states left on queue. +Progress(38) at 2024-11-06 17:22:53: 3,289,654,456 states generated (29,465,822 s/min), 323,660,313 distinct states found (2,619,503 ds/min), 83,689,075 states left on queue. +Progress(38) at 2024-11-06 17:23:53: 3,319,003,677 states generated (29,349,221 s/min), 326,391,347 distinct states found (2,731,034 ds/min), 84,261,368 states left on queue. +Progress(38) at 2024-11-06 17:24:53: 3,348,330,685 states generated (29,327,008 s/min), 329,204,934 distinct states found (2,813,587 ds/min), 84,925,046 states left on queue. +Progress(38) at 2024-11-06 17:25:53: 3,377,572,946 states generated (29,242,261 s/min), 331,997,887 distinct states found (2,792,953 ds/min), 85,533,473 states left on queue. +Progress(38) at 2024-11-06 17:26:53: 3,406,881,714 states generated (29,308,768 s/min), 334,599,745 distinct states found (2,601,858 ds/min), 86,047,276 states left on queue. +Progress(38) at 2024-11-06 17:27:53: 3,436,375,389 states generated (29,493,675 s/min), 337,261,572 distinct states found (2,661,827 ds/min), 86,591,357 states left on queue. +Progress(38) at 2024-11-06 17:28:53: 3,465,811,732 states generated (29,436,343 s/min), 339,829,613 distinct states found (2,568,041 ds/min), 87,057,550 states left on queue. +Progress(38) at 2024-11-06 17:29:53: 3,495,144,983 states generated (29,333,251 s/min), 342,566,275 distinct states found (2,736,662 ds/min), 87,671,131 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 17:30:53) +Progress(38) at 2024-11-06 17:30:53: 3,524,611,246 states generated (29,466,263 s/min), 345,366,358 distinct states found (2,800,083 ds/min), 88,316,673 states left on queue. +Progress(38) at 2024-11-06 17:31:53: 3,553,819,331 states generated (29,208,085 s/min), 348,291,666 distinct states found (2,925,308 ds/min), 89,059,679 states left on queue. +Progress(38) at 2024-11-06 17:32:53: 3,583,208,821 states generated (29,389,490 s/min), 350,796,636 distinct states found (2,504,970 ds/min), 89,478,521 states left on queue. +Progress(38) at 2024-11-06 17:33:53: 3,612,329,910 states generated (29,121,089 s/min), 353,414,448 distinct states found (2,617,812 ds/min), 90,008,568 states left on queue. +Progress(38) at 2024-11-06 17:34:53: 3,641,485,253 states generated (29,155,343 s/min), 356,010,441 distinct states found (2,595,993 ds/min), 90,486,313 states left on queue. +Progress(38) at 2024-11-06 17:35:53: 3,670,761,645 states generated (29,276,392 s/min), 358,411,973 distinct states found (2,401,532 ds/min), 90,799,029 states left on queue. 
+Progress(38) at 2024-11-06 17:36:53: 3,700,008,207 states generated (29,246,562 s/min), 360,943,422 distinct states found (2,531,449 ds/min), 91,235,694 states left on queue. +Progress(38) at 2024-11-06 17:37:53: 3,729,045,761 states generated (29,037,554 s/min), 363,523,499 distinct states found (2,580,077 ds/min), 91,685,579 states left on queue. +Progress(38) at 2024-11-06 17:38:53: 3,758,697,262 states generated (29,651,501 s/min), 365,860,396 distinct states found (2,336,897 ds/min), 92,003,313 states left on queue. +Progress(38) at 2024-11-06 17:39:53: 3,788,188,489 states generated (29,491,227 s/min), 368,369,398 distinct states found (2,509,002 ds/min), 92,452,083 states left on queue. +Progress(38) at 2024-11-06 17:40:53: 3,817,718,772 states generated (29,530,283 s/min), 370,855,965 distinct states found (2,486,567 ds/min), 92,899,812 states left on queue. +Progress(38) at 2024-11-06 17:41:53: 3,847,372,748 states generated (29,653,976 s/min), 373,231,774 distinct states found (2,375,809 ds/min), 93,202,503 states left on queue. +Progress(38) at 2024-11-06 17:42:53: 3,877,091,950 states generated (29,719,202 s/min), 375,934,374 distinct states found (2,702,600 ds/min), 93,775,105 states left on queue. +Progress(38) at 2024-11-06 17:43:53: 3,906,843,295 states generated (29,751,345 s/min), 378,304,497 distinct states found (2,370,123 ds/min), 94,098,611 states left on queue. +Progress(38) at 2024-11-06 17:44:53: 3,936,304,033 states generated (29,460,738 s/min), 380,793,774 distinct states found (2,489,277 ds/min), 94,560,398 states left on queue. +Progress(38) at 2024-11-06 17:45:53: 3,965,687,311 states generated (29,383,278 s/min), 383,366,376 distinct states found (2,572,602 ds/min), 95,062,163 states left on queue. +Progress(38) at 2024-11-06 17:46:53: 3,995,264,758 states generated (29,577,447 s/min), 385,832,314 distinct states found (2,465,938 ds/min), 95,460,777 states left on queue. +Progress(38) at 2024-11-06 17:47:53: 4,024,519,333 states generated (29,254,575 s/min), 388,384,282 distinct states found (2,551,968 ds/min), 95,931,698 states left on queue. +Progress(38) at 2024-11-06 17:48:53: 4,054,053,752 states generated (29,534,419 s/min), 390,990,581 distinct states found (2,606,299 ds/min), 96,493,705 states left on queue. +Progress(38) at 2024-11-06 17:49:53: 4,083,403,606 states generated (29,349,854 s/min), 393,717,328 distinct states found (2,726,747 ds/min), 97,099,592 states left on queue. +Progress(38) at 2024-11-06 17:50:53: 4,112,753,694 states generated (29,350,088 s/min), 396,441,909 distinct states found (2,724,581 ds/min), 97,694,523 states left on queue. +Progress(38) at 2024-11-06 17:51:53: 4,141,940,951 states generated (29,187,257 s/min), 399,238,612 distinct states found (2,796,703 ds/min), 98,387,103 states left on queue. +Progress(38) at 2024-11-06 17:52:53: 4,171,185,273 states generated (29,244,322 s/min), 401,861,376 distinct states found (2,622,764 ds/min), 98,900,168 states left on queue. +Progress(38) at 2024-11-06 17:53:53: 4,200,735,055 states generated (29,549,782 s/min), 404,419,627 distinct states found (2,558,251 ds/min), 99,388,507 states left on queue. +Progress(38) at 2024-11-06 17:54:53: 4,230,057,902 states generated (29,322,847 s/min), 406,926,477 distinct states found (2,506,850 ds/min), 99,826,562 states left on queue. +Progress(38) at 2024-11-06 17:55:53: 4,259,279,515 states generated (29,221,613 s/min), 409,512,606 distinct states found (2,586,129 ds/min), 100,340,214 states left on queue. 
+Progress(38) at 2024-11-06 17:56:53: 4,288,265,663 states generated (28,986,148 s/min), 412,254,402 distinct states found (2,741,796 ds/min), 100,966,036 states left on queue. +Progress(38) at 2024-11-06 17:57:53: 4,316,798,413 states generated (28,532,750 s/min), 415,047,481 distinct states found (2,793,079 ds/min), 101,589,869 states left on queue. +Progress(38) at 2024-11-06 17:58:53: 4,345,527,290 states generated (28,728,877 s/min), 417,768,588 distinct states found (2,721,107 ds/min), 102,133,503 states left on queue. +Progress(38) at 2024-11-06 17:59:53: 4,374,924,942 states generated (29,397,652 s/min), 420,254,082 distinct states found (2,485,494 ds/min), 102,500,461 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 18:00:54) +Progress(38) at 2024-11-06 18:00:54: 4,404,604,911 states generated (29,679,969 s/min), 422,801,691 distinct states found (2,547,609 ds/min), 102,936,440 states left on queue. +Progress(38) at 2024-11-06 18:01:54: 4,434,018,901 states generated (29,413,990 s/min), 425,477,119 distinct states found (2,675,428 ds/min), 103,472,987 states left on queue. +Progress(38) at 2024-11-06 18:02:54: 4,463,498,297 states generated (29,479,396 s/min), 427,949,289 distinct states found (2,472,170 ds/min), 103,858,839 states left on queue. +Progress(38) at 2024-11-06 18:03:54: 4,492,775,931 states generated (29,277,634 s/min), 430,592,094 distinct states found (2,642,805 ds/min), 104,353,609 states left on queue. +Progress(38) at 2024-11-06 18:04:54: 4,522,002,300 states generated (29,226,369 s/min), 433,322,584 distinct states found (2,730,490 ds/min), 104,949,753 states left on queue. +Progress(38) at 2024-11-06 18:05:54: 4,551,375,180 states generated (29,372,880 s/min), 436,005,138 distinct states found (2,682,554 ds/min), 105,482,546 states left on queue. +Progress(38) at 2024-11-06 18:06:54: 4,580,718,169 states generated (29,342,989 s/min), 438,516,579 distinct states found (2,511,441 ds/min), 105,868,435 states left on queue. +Progress(38) at 2024-11-06 18:07:54: 4,609,859,344 states generated (29,141,175 s/min), 441,134,700 distinct states found (2,618,121 ds/min), 106,390,335 states left on queue. +Progress(38) at 2024-11-06 18:08:54: 4,639,331,150 states generated (29,471,806 s/min), 443,662,679 distinct states found (2,527,979 ds/min), 106,821,264 states left on queue. +Progress(38) at 2024-11-06 18:09:54: 4,668,696,820 states generated (29,365,670 s/min), 446,222,969 distinct states found (2,560,290 ds/min), 107,277,508 states left on queue. +Progress(38) at 2024-11-06 18:10:54: 4,698,140,829 states generated (29,444,009 s/min), 448,693,022 distinct states found (2,470,053 ds/min), 107,654,262 states left on queue. +Progress(38) at 2024-11-06 18:11:54: 4,727,380,985 states generated (29,240,156 s/min), 451,459,276 distinct states found (2,766,254 ds/min), 108,284,101 states left on queue. +Progress(38) at 2024-11-06 18:12:54: 4,756,654,088 states generated (29,273,103 s/min), 454,180,180 distinct states found (2,720,904 ds/min), 108,879,205 states left on queue. +Progress(38) at 2024-11-06 18:13:54: 4,785,893,104 states generated (29,239,016 s/min), 457,001,077 distinct states found (2,820,897 ds/min), 109,511,015 states left on queue. +Progress(38) at 2024-11-06 18:14:54: 4,815,289,339 states generated (29,396,235 s/min), 459,530,340 distinct states found (2,529,263 ds/min), 109,951,588 states left on queue. 
+Progress(38) at 2024-11-06 18:15:54: 4,844,354,767 states generated (29,065,428 s/min), 462,144,567 distinct states found (2,614,227 ds/min), 110,455,692 states left on queue. +Progress(38) at 2024-11-06 18:16:54: 4,873,381,465 states generated (29,026,698 s/min), 464,718,128 distinct states found (2,573,561 ds/min), 110,936,992 states left on queue. +Progress(38) at 2024-11-06 18:17:54: 4,902,616,179 states generated (29,234,714 s/min), 467,171,620 distinct states found (2,453,492 ds/min), 111,288,450 states left on queue. +Progress(38) at 2024-11-06 18:18:54: 4,931,808,383 states generated (29,192,204 s/min), 469,593,253 distinct states found (2,421,633 ds/min), 111,607,240 states left on queue. +Progress(38) at 2024-11-06 18:19:54: 4,961,319,800 states generated (29,511,417 s/min), 471,795,067 distinct states found (2,201,814 ds/min), 111,770,077 states left on queue. +Progress(38) at 2024-11-06 18:20:54: 4,990,051,892 states generated (28,732,092 s/min), 474,595,717 distinct states found (2,800,650 ds/min), 112,380,795 states left on queue. +Progress(38) at 2024-11-06 18:21:54: 5,019,620,389 states generated (29,568,497 s/min), 476,860,178 distinct states found (2,264,461 ds/min), 112,610,789 states left on queue. +Progress(38) at 2024-11-06 18:22:54: 5,049,176,225 states generated (29,555,836 s/min), 479,117,000 distinct states found (2,256,822 ds/min), 112,849,809 states left on queue. +Progress(38) at 2024-11-06 18:23:54: 5,078,659,511 states generated (29,483,286 s/min), 481,552,566 distinct states found (2,435,566 ds/min), 113,238,679 states left on queue. +Progress(38) at 2024-11-06 18:24:54: 5,108,186,428 states generated (29,526,917 s/min), 483,970,290 distinct states found (2,417,724 ds/min), 113,645,974 states left on queue. +Progress(38) at 2024-11-06 18:25:54: 5,137,766,496 states generated (29,580,068 s/min), 486,204,445 distinct states found (2,234,155 ds/min), 113,816,273 states left on queue. +Progress(38) at 2024-11-06 18:26:54: 5,167,429,477 states generated (29,662,981 s/min), 488,726,479 distinct states found (2,522,034 ds/min), 114,265,425 states left on queue. +Progress(38) at 2024-11-06 18:27:54: 5,197,227,715 states generated (29,798,238 s/min), 491,213,848 distinct states found (2,487,369 ds/min), 114,645,624 states left on queue. +Progress(38) at 2024-11-06 18:28:54: 5,226,883,420 states generated (29,655,705 s/min), 493,480,968 distinct states found (2,267,120 ds/min), 114,901,786 states left on queue. +Progress(38) at 2024-11-06 18:29:54: 5,256,355,905 states generated (29,472,485 s/min), 495,866,549 distinct states found (2,385,581 ds/min), 115,277,276 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 18:30:55) +Progress(38) at 2024-11-06 18:30:55: 5,286,035,252 states generated (29,679,347 s/min), 498,324,679 distinct states found (2,458,130 ds/min), 115,663,015 states left on queue. +Progress(38) at 2024-11-06 18:31:55: 5,315,467,724 states generated (29,432,472 s/min), 500,723,577 distinct states found (2,398,898 ds/min), 116,023,619 states left on queue. +Progress(38) at 2024-11-06 18:32:55: 5,344,728,453 states generated (29,260,729 s/min), 503,156,876 distinct states found (2,433,299 ds/min), 116,384,801 states left on queue. +Progress(38) at 2024-11-06 18:33:55: 5,374,055,231 states generated (29,326,778 s/min), 505,588,957 distinct states found (2,432,081 ds/min), 116,786,679 states left on queue. 
+Progress(38) at 2024-11-06 18:34:55: 5,403,566,278 states generated (29,511,047 s/min), 508,096,703 distinct states found (2,507,746 ds/min), 117,258,425 states left on queue. +Progress(38) at 2024-11-06 18:35:55: 5,432,770,932 states generated (29,204,654 s/min), 510,765,370 distinct states found (2,668,667 ds/min), 117,821,443 states left on queue. +Progress(38) at 2024-11-06 18:36:55: 5,462,325,607 states generated (29,554,675 s/min), 513,306,027 distinct states found (2,540,657 ds/min), 118,252,946 states left on queue. +Progress(38) at 2024-11-06 18:37:55: 5,491,531,381 states generated (29,205,774 s/min), 516,017,383 distinct states found (2,711,356 ds/min), 118,857,035 states left on queue. +Progress(38) at 2024-11-06 18:38:55: 5,520,744,572 states generated (29,213,191 s/min), 518,696,783 distinct states found (2,679,400 ds/min), 119,445,954 states left on queue. +Progress(38) at 2024-11-06 18:39:55: 5,549,903,819 states generated (29,159,247 s/min), 521,329,662 distinct states found (2,632,879 ds/min), 119,977,569 states left on queue. +Progress(38) at 2024-11-06 18:40:55: 5,579,474,839 states generated (29,571,020 s/min), 523,702,578 distinct states found (2,372,916 ds/min), 120,289,041 states left on queue. +Progress(38) at 2024-11-06 18:41:55: 5,608,757,550 states generated (29,282,711 s/min), 526,191,629 distinct states found (2,489,051 ds/min), 120,719,632 states left on queue. +Progress(38) at 2024-11-06 18:42:55: 5,638,085,090 states generated (29,327,540 s/min), 528,478,505 distinct states found (2,286,876 ds/min), 120,990,568 states left on queue. +Progress(38) at 2024-11-06 18:43:55: 5,667,141,833 states generated (29,056,743 s/min), 531,035,593 distinct states found (2,557,088 ds/min), 121,480,763 states left on queue. +Progress(38) at 2024-11-06 18:44:55: 5,696,139,104 states generated (28,997,271 s/min), 533,684,330 distinct states found (2,648,737 ds/min), 122,027,516 states left on queue. +Progress(38) at 2024-11-06 18:45:55: 5,724,868,902 states generated (28,729,798 s/min), 536,316,715 distinct states found (2,632,385 ds/min), 122,548,317 states left on queue. +Progress(38) at 2024-11-06 18:46:55: 5,753,438,871 states generated (28,569,969 s/min), 539,001,028 distinct states found (2,684,313 ds/min), 123,041,578 states left on queue. +Progress(38) at 2024-11-06 18:47:55: 5,782,391,778 states generated (28,952,907 s/min), 541,537,259 distinct states found (2,536,231 ds/min), 123,436,184 states left on queue. +Progress(38) at 2024-11-06 18:48:55: 5,811,823,996 states generated (29,432,218 s/min), 543,896,432 distinct states found (2,359,173 ds/min), 123,698,698 states left on queue. +Progress(38) at 2024-11-06 18:49:55: 5,841,258,941 states generated (29,434,945 s/min), 546,273,191 distinct states found (2,376,759 ds/min), 124,012,754 states left on queue. +Progress(38) at 2024-11-06 18:50:55: 5,870,667,995 states generated (29,409,054 s/min), 548,835,686 distinct states found (2,562,495 ds/min), 124,450,482 states left on queue. +Progress(38) at 2024-11-06 18:51:55: 5,900,038,718 states generated (29,370,723 s/min), 551,304,457 distinct states found (2,468,771 ds/min), 124,805,220 states left on queue. +Progress(38) at 2024-11-06 18:52:55: 5,929,442,421 states generated (29,403,703 s/min), 553,776,296 distinct states found (2,471,839 ds/min), 125,178,608 states left on queue. +Progress(38) at 2024-11-06 18:53:55: 5,958,838,496 states generated (29,396,075 s/min), 556,289,762 distinct states found (2,513,466 ds/min), 125,588,158 states left on queue. 
+Progress(38) at 2024-11-06 18:54:55: 5,988,187,325 states generated (29,348,829 s/min), 558,898,224 distinct states found (2,608,462 ds/min), 126,074,377 states left on queue. +Progress(38) at 2024-11-06 18:55:55: 6,017,546,111 states generated (29,358,786 s/min), 561,530,468 distinct states found (2,632,244 ds/min), 126,579,784 states left on queue. +Progress(38) at 2024-11-06 18:56:55: 6,046,777,143 states generated (29,231,032 s/min), 564,182,546 distinct states found (2,652,078 ds/min), 127,037,883 states left on queue. +Progress(39) at 2024-11-06 18:57:55: 6,076,111,479 states generated (29,334,336 s/min), 566,509,898 distinct states found (2,327,352 ds/min), 127,319,036 states left on queue. +Progress(39) at 2024-11-06 18:58:55: 6,105,215,668 states generated (29,104,189 s/min), 569,000,954 distinct states found (2,491,056 ds/min), 127,724,185 states left on queue. +Progress(39) at 2024-11-06 18:59:55: 6,134,619,650 states generated (29,403,982 s/min), 571,444,199 distinct states found (2,443,245 ds/min), 128,083,849 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 19:00:55) +Progress(39) at 2024-11-06 19:00:55: 6,164,303,226 states generated (29,683,576 s/min), 574,046,920 distinct states found (2,602,721 ds/min), 128,537,330 states left on queue. +Progress(39) at 2024-11-06 19:01:55: 6,193,710,515 states generated (29,407,289 s/min), 576,294,161 distinct states found (2,247,241 ds/min), 128,749,186 states left on queue. +Progress(39) at 2024-11-06 19:02:55: 6,223,050,437 states generated (29,339,922 s/min), 578,840,811 distinct states found (2,546,650 ds/min), 129,198,375 states left on queue. +Progress(39) at 2024-11-06 19:03:55: 6,252,273,339 states generated (29,222,902 s/min), 581,530,481 distinct states found (2,689,670 ds/min), 129,745,195 states left on queue. +Progress(39) at 2024-11-06 19:04:55: 6,281,535,213 states generated (29,261,874 s/min), 584,206,969 distinct states found (2,676,488 ds/min), 130,306,182 states left on queue. +Progress(39) at 2024-11-06 19:05:55: 6,310,569,147 states generated (29,033,934 s/min), 587,031,959 distinct states found (2,824,990 ds/min), 130,922,629 states left on queue. +Progress(39) at 2024-11-06 19:06:55: 6,339,951,741 states generated (29,382,594 s/min), 589,709,668 distinct states found (2,677,709 ds/min), 131,483,555 states left on queue. +Progress(39) at 2024-11-06 19:07:55: 6,369,354,481 states generated (29,402,740 s/min), 591,964,654 distinct states found (2,254,986 ds/min), 131,688,532 states left on queue. +Progress(39) at 2024-11-06 19:08:55: 6,398,254,591 states generated (28,900,110 s/min), 594,604,924 distinct states found (2,640,270 ds/min), 132,195,069 states left on queue. +Progress(39) at 2024-11-06 19:09:55: 6,427,422,756 states generated (29,168,165 s/min), 597,059,083 distinct states found (2,454,159 ds/min), 132,571,626 states left on queue. +Progress(39) at 2024-11-06 19:10:55: 6,456,469,721 states generated (29,046,965 s/min), 599,400,317 distinct states found (2,341,234 ds/min), 132,826,474 states left on queue. +Progress(39) at 2024-11-06 19:11:55: 6,485,733,442 states generated (29,263,721 s/min), 602,040,336 distinct states found (2,640,019 ds/min), 133,286,664 states left on queue. +Progress(39) at 2024-11-06 19:12:55: 6,515,001,998 states generated (29,268,556 s/min), 604,003,958 distinct states found (1,963,622 ds/min), 133,255,252 states left on queue. 
+Progress(39) at 2024-11-06 19:13:55: 6,544,172,146 states generated (29,170,148 s/min), 606,473,164 distinct states found (2,469,206 ds/min), 133,627,323 states left on queue. +Progress(39) at 2024-11-06 19:14:55: 6,572,975,355 states generated (28,803,209 s/min), 609,043,606 distinct states found (2,570,442 ds/min), 134,023,262 states left on queue. +Progress(39) at 2024-11-06 19:15:55: 6,602,534,934 states generated (29,559,579 s/min), 611,212,652 distinct states found (2,169,046 ds/min), 134,205,070 states left on queue. +Progress(39) at 2024-11-06 19:16:55: 6,632,044,851 states generated (29,509,917 s/min), 613,377,378 distinct states found (2,164,726 ds/min), 134,360,577 states left on queue. +Progress(39) at 2024-11-06 19:17:55: 6,661,465,356 states generated (29,420,505 s/min), 615,729,605 distinct states found (2,352,227 ds/min), 134,679,148 states left on queue. +Progress(39) at 2024-11-06 19:18:55: 6,690,848,776 states generated (29,383,420 s/min), 618,034,126 distinct states found (2,304,521 ds/min), 134,989,999 states left on queue. +Progress(39) at 2024-11-06 19:19:55: 6,720,362,641 states generated (29,513,865 s/min), 620,264,990 distinct states found (2,230,864 ds/min), 135,213,527 states left on queue. +Progress(39) at 2024-11-06 19:20:55: 6,749,995,972 states generated (29,633,331 s/min), 622,424,423 distinct states found (2,159,433 ds/min), 135,336,269 states left on queue. +Progress(39) at 2024-11-06 19:21:55: 6,779,641,479 states generated (29,645,507 s/min), 624,953,002 distinct states found (2,528,579 ds/min), 135,781,717 states left on queue. +Progress(39) at 2024-11-06 19:22:55: 6,809,496,805 states generated (29,855,326 s/min), 627,297,563 distinct states found (2,344,561 ds/min), 136,040,988 states left on queue. +Progress(39) at 2024-11-06 19:23:55: 6,839,096,708 states generated (29,599,903 s/min), 629,464,688 distinct states found (2,167,125 ds/min), 136,210,971 states left on queue. +Progress(39) at 2024-11-06 19:24:55: 6,868,614,311 states generated (29,517,603 s/min), 631,704,627 distinct states found (2,239,939 ds/min), 136,469,731 states left on queue. +Progress(39) at 2024-11-06 19:25:55: 6,897,932,930 states generated (29,318,619 s/min), 633,961,042 distinct states found (2,256,415 ds/min), 136,714,912 states left on queue. +Progress(39) at 2024-11-06 19:26:55: 6,927,200,602 states generated (29,267,672 s/min), 636,414,800 distinct states found (2,453,758 ds/min), 137,101,547 states left on queue. +Progress(39) at 2024-11-06 19:27:55: 6,956,755,074 states generated (29,554,472 s/min), 638,616,489 distinct states found (2,201,689 ds/min), 137,285,238 states left on queue. +Progress(39) at 2024-11-06 19:28:55: 6,985,926,285 states generated (29,171,211 s/min), 640,970,274 distinct states found (2,353,785 ds/min), 137,592,586 states left on queue. +Progress(39) at 2024-11-06 19:29:55: 7,015,240,294 states generated (29,314,009 s/min), 643,310,280 distinct states found (2,340,006 ds/min), 137,914,322 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 19:30:56) +Progress(39) at 2024-11-06 19:30:56: 7,045,112,039 states generated (29,871,745 s/min), 645,650,251 distinct states found (2,339,971 ds/min), 138,248,533 states left on queue. +Progress(39) at 2024-11-06 19:31:56: 7,074,347,122 states generated (29,235,083 s/min), 648,286,341 distinct states found (2,636,090 ds/min), 138,800,606 states left on queue. 
+Progress(39) at 2024-11-06 19:32:56: 7,103,701,427 states generated (29,354,305 s/min), 650,776,754 distinct states found (2,490,413 ds/min), 139,200,935 states left on queue. +Progress(39) at 2024-11-06 19:33:56: 7,133,125,574 states generated (29,424,147 s/min), 653,222,778 distinct states found (2,446,024 ds/min), 139,553,972 states left on queue. +Progress(39) at 2024-11-06 19:34:56: 7,162,393,954 states generated (29,268,380 s/min), 655,812,815 distinct states found (2,590,037 ds/min), 140,051,736 states left on queue. +Progress(39) at 2024-11-06 19:35:56: 7,191,614,309 states generated (29,220,355 s/min), 658,388,779 distinct states found (2,575,964 ds/min), 140,550,430 states left on queue. +Progress(39) at 2024-11-06 19:36:56: 7,220,841,977 states generated (29,227,668 s/min), 660,885,901 distinct states found (2,497,122 ds/min), 140,973,038 states left on queue. +Progress(39) at 2024-11-06 19:37:56: 7,250,020,241 states generated (29,178,264 s/min), 663,335,701 distinct states found (2,449,800 ds/min), 141,327,800 states left on queue. +Progress(39) at 2024-11-06 19:38:56: 7,279,545,923 states generated (29,525,682 s/min), 665,706,252 distinct states found (2,370,551 ds/min), 141,666,628 states left on queue. +Progress(39) at 2024-11-06 19:39:56: 7,308,806,585 states generated (29,260,662 s/min), 668,059,763 distinct states found (2,353,511 ds/min), 141,985,139 states left on queue. +Progress(39) at 2024-11-06 19:40:56: 7,338,028,888 states generated (29,222,303 s/min), 670,241,848 distinct states found (2,182,085 ds/min), 142,169,842 states left on queue. +Progress(39) at 2024-11-06 19:41:56: 7,367,241,753 states generated (29,212,865 s/min), 672,613,255 distinct states found (2,371,407 ds/min), 142,507,724 states left on queue. +Progress(39) at 2024-11-06 19:42:56: 7,396,269,434 states generated (29,027,681 s/min), 675,112,517 distinct states found (2,499,262 ds/min), 142,941,967 states left on queue. +Progress(39) at 2024-11-06 19:43:56: 7,425,237,701 states generated (28,968,267 s/min), 677,646,850 distinct states found (2,534,333 ds/min), 143,388,301 states left on queue. +Progress(39) at 2024-11-06 19:44:56: 7,453,929,312 states generated (28,691,611 s/min), 680,183,486 distinct states found (2,536,636 ds/min), 143,823,998 states left on queue. +Progress(39) at 2024-11-06 19:45:56: 7,482,605,282 states generated (28,675,970 s/min), 682,751,269 distinct states found (2,567,783 ds/min), 144,211,694 states left on queue. +Progress(39) at 2024-11-06 19:46:56: 7,511,402,194 states generated (28,796,912 s/min), 685,177,338 distinct states found (2,426,069 ds/min), 144,502,576 states left on queue. +Progress(39) at 2024-11-06 19:47:56: 7,540,667,315 states generated (29,265,121 s/min), 687,470,422 distinct states found (2,293,084 ds/min), 144,717,485 states left on queue. +Progress(39) at 2024-11-06 19:48:56: 7,570,065,371 states generated (29,398,056 s/min), 689,724,172 distinct states found (2,253,750 ds/min), 144,895,541 states left on queue. +Progress(39) at 2024-11-06 19:49:56: 7,599,596,791 states generated (29,531,420 s/min), 692,064,101 distinct states found (2,339,929 ds/min), 145,171,911 states left on queue. +Progress(39) at 2024-11-06 19:50:56: 7,629,011,363 states generated (29,414,572 s/min), 694,540,161 distinct states found (2,476,060 ds/min), 145,540,423 states left on queue. +Progress(39) at 2024-11-06 19:51:56: 7,658,453,965 states generated (29,442,602 s/min), 696,912,122 distinct states found (2,371,961 ds/min), 145,809,567 states left on queue. 
+Progress(39) at 2024-11-06 19:52:56: 7,687,913,137 states generated (29,459,172 s/min), 699,240,630 distinct states found (2,328,508 ds/min), 146,098,273 states left on queue. +Progress(39) at 2024-11-06 19:53:56: 7,717,161,254 states generated (29,248,117 s/min), 701,789,915 distinct states found (2,549,285 ds/min), 146,502,121 states left on queue. +Progress(39) at 2024-11-06 19:54:56: 7,746,587,948 states generated (29,426,694 s/min), 704,037,014 distinct states found (2,247,099 ds/min), 146,684,369 states left on queue. +Progress(39) at 2024-11-06 19:55:56: 7,775,767,241 states generated (29,179,293 s/min), 706,750,225 distinct states found (2,713,211 ds/min), 147,270,858 states left on queue. +Progress(39) at 2024-11-06 19:56:56: 7,805,143,313 states generated (29,376,072 s/min), 709,214,940 distinct states found (2,464,715 ds/min), 147,627,166 states left on queue. +Progress(39) at 2024-11-06 19:57:56: 7,834,403,478 states generated (29,260,165 s/min), 711,759,633 distinct states found (2,544,693 ds/min), 147,996,842 states left on queue. +Progress(40) at 2024-11-06 19:58:56: 7,863,785,909 states generated (29,382,431 s/min), 713,915,903 distinct states found (2,156,270 ds/min), 148,107,480 states left on queue. +Progress(40) at 2024-11-06 19:59:56: 7,892,661,923 states generated (28,876,014 s/min), 716,529,052 distinct states found (2,613,149 ds/min), 148,615,346 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 20:00:57) +Progress(40) at 2024-11-06 20:00:57: 7,922,354,868 states generated (29,692,945 s/min), 718,724,840 distinct states found (2,195,788 ds/min), 148,760,464 states left on queue. +Progress(40) at 2024-11-06 20:01:57: 7,951,821,345 states generated (29,466,477 s/min), 721,199,790 distinct states found (2,474,950 ds/min), 149,133,458 states left on queue. +Progress(40) at 2024-11-06 20:02:57: 7,981,212,562 states generated (29,391,217 s/min), 723,637,084 distinct states found (2,437,294 ds/min), 149,453,388 states left on queue. +Progress(40) at 2024-11-06 20:03:57: 8,010,639,344 states generated (29,426,782 s/min), 725,776,597 distinct states found (2,139,513 ds/min), 149,580,205 states left on queue. +Progress(40) at 2024-11-06 20:04:57: 8,039,970,078 states generated (29,330,734 s/min), 728,145,896 distinct states found (2,369,299 ds/min), 149,873,787 states left on queue. +Progress(40) at 2024-11-06 20:05:57: 8,069,221,501 states generated (29,251,423 s/min), 730,835,980 distinct states found (2,690,084 ds/min), 150,431,663 states left on queue. +Progress(40) at 2024-11-06 20:06:57: 8,098,568,645 states generated (29,347,144 s/min), 733,266,238 distinct states found (2,430,258 ds/min), 150,772,190 states left on queue. +Progress(40) at 2024-11-06 20:07:57: 8,127,646,970 states generated (29,078,325 s/min), 736,001,441 distinct states found (2,735,203 ds/min), 151,368,297 states left on queue. +Progress(40) at 2024-11-06 20:08:57: 8,156,755,007 states generated (29,108,037 s/min), 738,759,675 distinct states found (2,758,234 ds/min), 151,912,929 states left on queue. +Progress(40) at 2024-11-06 20:09:57: 8,186,234,810 states generated (29,479,803 s/min), 741,336,146 distinct states found (2,576,471 ds/min), 152,376,828 states left on queue. +Progress(40) at 2024-11-06 20:10:57: 8,215,641,994 states generated (29,407,184 s/min), 743,647,353 distinct states found (2,311,207 ds/min), 152,617,899 states left on queue. 
+Progress(40) at 2024-11-06 20:11:57: 8,244,746,445 states generated (29,104,451 s/min), 746,080,007 distinct states found (2,432,654 ds/min), 152,939,104 states left on queue. +Progress(40) at 2024-11-06 20:12:57: 8,273,514,095 states generated (28,767,650 s/min), 748,726,701 distinct states found (2,646,694 ds/min), 153,445,645 states left on queue. +Progress(40) at 2024-11-06 20:13:57: 8,302,647,011 states generated (29,132,916 s/min), 751,041,420 distinct states found (2,314,719 ds/min), 153,711,631 states left on queue. +Progress(40) at 2024-11-06 20:14:57: 8,331,785,512 states generated (29,138,501 s/min), 753,262,324 distinct states found (2,220,904 ds/min), 153,861,206 states left on queue. +Progress(40) at 2024-11-06 20:15:57: 8,361,058,813 states generated (29,273,301 s/min), 755,881,803 distinct states found (2,619,479 ds/min), 154,293,451 states left on queue. +Progress(40) at 2024-11-06 20:16:57: 8,390,323,842 states generated (29,265,029 s/min), 757,769,813 distinct states found (1,888,010 ds/min), 154,184,183 states left on queue. +Progress(40) at 2024-11-06 20:17:57: 8,419,579,524 states generated (29,255,682 s/min), 760,009,795 distinct states found (2,239,982 ds/min), 154,382,656 states left on queue. +Progress(40) at 2024-11-06 20:18:57: 8,448,394,343 states generated (28,814,819 s/min), 762,597,225 distinct states found (2,587,430 ds/min), 154,795,314 states left on queue. +Progress(40) at 2024-11-06 20:19:57: 8,477,530,142 states generated (29,135,799 s/min), 764,903,184 distinct states found (2,305,959 ds/min), 154,997,361 states left on queue. +Progress(40) at 2024-11-06 20:20:57: 8,507,035,930 states generated (29,505,788 s/min), 766,887,142 distinct states found (1,983,958 ds/min), 155,034,831 states left on queue. +Progress(40) at 2024-11-06 20:21:57: 8,536,505,703 states generated (29,469,773 s/min), 769,048,483 distinct states found (2,161,341 ds/min), 155,183,742 states left on queue. +Progress(40) at 2024-11-06 20:22:57: 8,565,867,584 states generated (29,361,881 s/min), 771,258,076 distinct states found (2,209,593 ds/min), 155,385,262 states left on queue. +Progress(40) at 2024-11-06 20:23:57: 8,595,185,764 states generated (29,318,180 s/min), 773,454,985 distinct states found (2,196,909 ds/min), 155,614,111 states left on queue. +Progress(40) at 2024-11-06 20:24:57: 8,624,496,269 states generated (29,310,505 s/min), 775,619,630 distinct states found (2,164,645 ds/min), 155,798,174 states left on queue. +Progress(40) at 2024-11-06 20:25:57: 8,654,080,073 states generated (29,583,804 s/min), 777,637,410 distinct states found (2,017,780 ds/min), 155,782,045 states left on queue. +Progress(40) at 2024-11-06 20:26:57: 8,683,722,009 states generated (29,641,936 s/min), 779,940,399 distinct states found (2,302,989 ds/min), 156,073,330 states left on queue. +Progress(40) at 2024-11-06 20:27:57: 8,713,410,725 states generated (29,688,716 s/min), 782,406,987 distinct states found (2,466,588 ds/min), 156,445,902 states left on queue. +Progress(40) at 2024-11-06 20:28:57: 8,743,158,002 states generated (29,747,277 s/min), 784,542,609 distinct states found (2,135,622 ds/min), 156,539,841 states left on queue. +Progress(40) at 2024-11-06 20:29:57: 8,772,688,809 states generated (29,530,807 s/min), 786,583,608 distinct states found (2,040,999 ds/min), 156,630,041 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 20:30:57) +Progress(40) at 2024-11-06 20:30:57: 8,802,299,219 states generated (29,610,410 s/min), 788,709,007 distinct states found (2,125,399 ds/min), 156,780,966 states left on queue. +Progress(40) at 2024-11-06 20:31:57: 8,831,545,663 states generated (29,246,444 s/min), 790,874,634 distinct states found (2,165,627 ds/min), 156,943,688 states left on queue. +Progress(40) at 2024-11-06 20:32:57: 8,860,742,526 states generated (29,196,863 s/min), 793,218,612 distinct states found (2,343,978 ds/min), 157,247,738 states left on queue. +Progress(40) at 2024-11-06 20:33:57: 8,890,145,689 states generated (29,403,163 s/min), 795,347,746 distinct states found (2,129,134 ds/min), 157,376,715 states left on queue. +Progress(40) at 2024-11-06 20:34:57: 8,919,277,440 states generated (29,131,751 s/min), 797,557,991 distinct states found (2,210,245 ds/min), 157,566,508 states left on queue. +Progress(40) at 2024-11-06 20:35:57: 8,948,368,355 states generated (29,090,915 s/min), 799,870,441 distinct states found (2,312,450 ds/min), 157,825,337 states left on queue. +Progress(40) at 2024-11-06 20:36:57: 8,977,811,769 states generated (29,443,414 s/min), 801,992,418 distinct states found (2,121,977 ds/min), 158,015,008 states left on queue. +Progress(40) at 2024-11-06 20:37:57: 9,007,285,675 states generated (29,473,906 s/min), 804,250,024 distinct states found (2,257,606 ds/min), 158,295,507 states left on queue. +Progress(40) at 2024-11-06 20:38:57: 9,036,450,953 states generated (29,165,278 s/min), 806,795,860 distinct states found (2,545,836 ds/min), 158,767,907 states left on queue. +Progress(40) at 2024-11-06 20:39:57: 9,065,704,268 states generated (29,253,315 s/min), 809,198,438 distinct states found (2,402,578 ds/min), 159,105,121 states left on queue. +Progress(40) at 2024-11-06 20:40:57: 9,095,165,427 states generated (29,461,159 s/min), 811,512,584 distinct states found (2,314,146 ds/min), 159,345,117 states left on queue. +Progress(40) at 2024-11-06 20:41:57: 9,124,541,297 states generated (29,375,870 s/min), 813,905,920 distinct states found (2,393,336 ds/min), 159,672,325 states left on queue. +Progress(40) at 2024-11-06 20:42:57: 9,153,712,591 states generated (29,171,294 s/min), 816,392,570 distinct states found (2,486,650 ds/min), 160,082,547 states left on queue. +Progress(40) at 2024-11-06 20:43:57: 9,182,920,866 states generated (29,208,275 s/min), 818,845,538 distinct states found (2,452,968 ds/min), 160,476,056 states left on queue. +Progress(40) at 2024-11-06 20:44:57: 9,212,093,614 states generated (29,172,748 s/min), 821,212,595 distinct states found (2,367,057 ds/min), 160,787,698 states left on queue. +Progress(40) at 2024-11-06 20:45:57: 9,241,177,362 states generated (29,083,748 s/min), 823,731,111 distinct states found (2,518,516 ds/min), 161,227,975 states left on queue. +Progress(40) at 2024-11-06 20:46:57: 9,270,666,448 states generated (29,489,086 s/min), 825,877,262 distinct states found (2,146,151 ds/min), 161,339,209 states left on queue. +Progress(40) at 2024-11-06 20:47:57: 9,299,985,513 states generated (29,319,065 s/min), 828,195,512 distinct states found (2,318,250 ds/min), 161,644,069 states left on queue. +Progress(40) at 2024-11-06 20:48:57: 9,329,155,005 states generated (29,169,492 s/min), 830,386,518 distinct states found (2,191,006 ds/min), 161,807,802 states left on queue. 
+Progress(40) at 2024-11-06 20:49:57: 9,358,433,771 states generated (29,278,766 s/min), 832,419,931 distinct states found (2,033,413 ds/min), 161,882,018 states left on queue. +Progress(40) at 2024-11-06 20:50:57: 9,387,665,287 states generated (29,231,516 s/min), 834,751,267 distinct states found (2,331,336 ds/min), 162,183,217 states left on queue. +Progress(40) at 2024-11-06 20:51:57: 9,416,697,647 states generated (29,032,360 s/min), 837,127,657 distinct states found (2,376,390 ds/min), 162,511,558 states left on queue. +Progress(40) at 2024-11-06 20:52:57: 9,445,747,666 states generated (29,050,019 s/min), 839,556,372 distinct states found (2,428,715 ds/min), 162,873,418 states left on queue. +Progress(40) at 2024-11-06 20:53:57: 9,474,599,613 states generated (28,851,947 s/min), 841,985,780 distinct states found (2,429,408 ds/min), 163,231,531 states left on queue. +Progress(40) at 2024-11-06 20:54:57: 9,503,408,525 states generated (28,808,912 s/min), 844,368,680 distinct states found (2,382,900 ds/min), 163,533,407 states left on queue. +Progress(40) at 2024-11-06 20:55:57: 9,532,128,492 states generated (28,719,967 s/min), 846,804,519 distinct states found (2,435,839 ds/min), 163,787,695 states left on queue. +Progress(40) at 2024-11-06 20:56:57: 9,560,935,598 states generated (28,807,106 s/min), 849,075,143 distinct states found (2,270,624 ds/min), 163,946,240 states left on queue. +Progress(40) at 2024-11-06 20:57:57: 9,590,127,374 states generated (29,191,776 s/min), 851,260,378 distinct states found (2,185,235 ds/min), 164,077,372 states left on queue. +Progress(40) at 2024-11-06 20:58:57: 9,619,514,341 states generated (29,386,967 s/min), 853,352,738 distinct states found (2,092,360 ds/min), 164,118,186 states left on queue. +Progress(40) at 2024-11-06 20:59:57: 9,648,985,302 states generated (29,470,961 s/min), 855,543,408 distinct states found (2,190,670 ds/min), 164,279,076 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 21:00:58) +Progress(40) at 2024-11-06 21:00:58: 9,678,677,722 states generated (29,692,420 s/min), 857,894,775 distinct states found (2,351,367 ds/min), 164,516,395 states left on queue. +Progress(40) at 2024-11-06 21:01:58: 9,708,095,509 states generated (29,417,787 s/min), 860,383,155 distinct states found (2,488,380 ds/min), 164,898,153 states left on queue. +Progress(40) at 2024-11-06 21:02:58: 9,737,488,378 states generated (29,392,869 s/min), 862,497,194 distinct states found (2,114,039 ds/min), 164,966,010 states left on queue. +Progress(40) at 2024-11-06 21:03:58: 9,766,895,552 states generated (29,407,174 s/min), 864,819,525 distinct states found (2,322,331 ds/min), 165,232,701 states left on queue. +Progress(40) at 2024-11-06 21:04:58: 9,796,208,300 states generated (29,312,748 s/min), 867,276,841 distinct states found (2,457,316 ds/min), 165,568,613 states left on queue. +Progress(40) at 2024-11-06 21:05:58: 9,825,603,726 states generated (29,395,426 s/min), 869,434,526 distinct states found (2,157,685 ds/min), 165,685,610 states left on queue. +Progress(40) at 2024-11-06 21:06:58: 9,854,789,772 states generated (29,186,046 s/min), 871,934,034 distinct states found (2,499,508 ds/min), 166,084,000 states left on queue. +Progress(40) at 2024-11-06 21:07:58: 9,884,028,390 states generated (29,238,618 s/min), 874,443,659 distinct states found (2,509,625 ds/min), 166,483,652 states left on queue. 
+Progress(40) at 2024-11-06 21:08:58: 9,913,377,669 states generated (29,349,279 s/min), 876,803,913 distinct states found (2,360,254 ds/min), 166,740,702 states left on queue. +Progress(40) at 2024-11-06 21:09:58: 9,942,721,749 states generated (29,344,080 s/min), 879,187,270 distinct states found (2,383,357 ds/min), 166,953,562 states left on queue. +Progress(41) at 2024-11-06 21:10:58: 9,972,078,704 states generated (29,356,955 s/min), 881,233,361 distinct states found (2,046,091 ds/min), 166,999,841 states left on queue. +Progress(41) at 2024-11-06 21:11:58: 10,000,914,792 states generated (28,836,088 s/min), 883,811,441 distinct states found (2,578,080 ds/min), 167,466,583 states left on queue. +Progress(41) at 2024-11-06 21:12:58: 10,030,210,434 states generated (29,295,642 s/min), 885,899,950 distinct states found (2,088,509 ds/min), 167,531,826 states left on queue. +Progress(41) at 2024-11-06 21:13:58: 10,059,587,070 states generated (29,376,636 s/min), 888,188,669 distinct states found (2,288,719 ds/min), 167,753,242 states left on queue. +Progress(41) at 2024-11-06 21:14:58: 10,089,078,901 states generated (29,491,831 s/min), 890,649,997 distinct states found (2,461,328 ds/min), 168,098,890 states left on queue. +Progress(41) at 2024-11-06 21:15:58: 10,118,348,352 states generated (29,269,451 s/min), 892,695,892 distinct states found (2,045,895 ds/min), 168,141,532 states left on queue. +Progress(41) at 2024-11-06 21:16:58: 10,147,644,676 states generated (29,296,324 s/min), 894,823,997 distinct states found (2,128,105 ds/min), 168,231,032 states left on queue. +Progress(41) at 2024-11-06 21:17:58: 10,176,967,773 states generated (29,323,097 s/min), 897,225,523 distinct states found (2,401,526 ds/min), 168,555,740 states left on queue. +Progress(41) at 2024-11-06 21:18:58: 10,206,275,174 states generated (29,307,401 s/min), 899,814,626 distinct states found (2,589,103 ds/min), 169,020,971 states left on queue. +Progress(41) at 2024-11-06 21:19:58: 10,235,593,993 states generated (29,318,819 s/min), 902,141,356 distinct states found (2,326,730 ds/min), 169,267,251 states left on queue. +Progress(41) at 2024-11-06 21:20:58: 10,264,799,049 states generated (29,205,056 s/min), 904,746,333 distinct states found (2,604,977 ds/min), 169,758,459 states left on queue. +Progress(41) at 2024-11-06 21:21:58: 10,293,910,586 states generated (29,111,537 s/min), 907,433,182 distinct states found (2,686,849 ds/min), 170,277,176 states left on queue. +Progress(41) at 2024-11-06 21:22:58: 10,323,190,750 states generated (29,280,164 s/min), 910,052,108 distinct states found (2,618,926 ds/min), 170,695,212 states left on queue. +Progress(41) at 2024-11-06 21:23:58: 10,352,580,182 states generated (29,389,432 s/min), 912,516,064 distinct states found (2,463,956 ds/min), 171,083,771 states left on queue. +Progress(41) at 2024-11-06 21:24:58: 10,381,951,479 states generated (29,371,297 s/min), 914,781,443 distinct states found (2,265,379 ds/min), 171,281,545 states left on queue. +Progress(41) at 2024-11-06 21:25:58: 10,411,026,945 states generated (29,075,466 s/min), 917,078,052 distinct states found (2,296,609 ds/min), 171,498,613 states left on queue. +Progress(41) at 2024-11-06 21:26:58: 10,439,904,441 states generated (28,877,496 s/min), 919,547,808 distinct states found (2,469,756 ds/min), 171,860,589 states left on queue. 
+Progress(41) at 2024-11-06 21:27:58: 10,469,008,600 states generated (29,104,159 s/min), 921,912,547 distinct states found (2,364,739 ds/min), 172,121,551 states left on queue. +Progress(41) at 2024-11-06 21:28:58: 10,497,834,986 states generated (28,826,386 s/min), 924,235,840 distinct states found (2,323,293 ds/min), 172,353,661 states left on queue. +Progress(41) at 2024-11-06 21:29:58: 10,527,064,696 states generated (29,229,710 s/min), 926,456,744 distinct states found (2,220,904 ds/min), 172,508,439 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 21:30:59) +Progress(41) at 2024-11-06 21:30:59: 10,556,579,142 states generated (29,514,446 s/min), 928,988,872 distinct states found (2,532,128 ds/min), 172,833,183 states left on queue. +Progress(41) at 2024-11-06 21:31:59: 10,585,719,909 states generated (29,140,767 s/min), 930,745,149 distinct states found (1,756,277 ds/min), 172,622,496 states left on queue. +Progress(41) at 2024-11-06 21:32:59: 10,614,881,115 states generated (29,161,206 s/min), 932,948,083 distinct states found (2,202,934 ds/min), 172,792,818 states left on queue. +Progress(41) at 2024-11-06 21:33:59: 10,643,693,909 states generated (28,812,794 s/min), 935,441,862 distinct states found (2,493,779 ds/min), 173,119,721 states left on queue. +Progress(41) at 2024-11-06 21:34:59: 10,672,671,166 states generated (28,977,257 s/min), 937,653,961 distinct states found (2,212,099 ds/min), 173,216,843 states left on queue. +Progress(41) at 2024-11-06 21:35:59: 10,702,072,440 states generated (29,401,274 s/min), 939,638,920 distinct states found (1,984,959 ds/min), 173,254,076 states left on queue. +Progress(41) at 2024-11-06 21:36:59: 10,731,415,292 states generated (29,342,852 s/min), 941,583,653 distinct states found (1,944,733 ds/min), 173,229,968 states left on queue. +Progress(41) at 2024-11-06 21:37:59: 10,760,802,656 states generated (29,387,364 s/min), 943,770,610 distinct states found (2,186,957 ds/min), 173,412,799 states left on queue. +Progress(41) at 2024-11-06 21:38:59: 10,789,961,996 states generated (29,159,340 s/min), 945,790,519 distinct states found (2,019,909 ds/min), 173,482,204 states left on queue. +Progress(41) at 2024-11-06 21:39:59: 10,819,303,972 states generated (29,341,976 s/min), 947,902,156 distinct states found (2,111,637 ds/min), 173,640,941 states left on queue. +Progress(41) at 2024-11-06 21:40:59: 10,848,636,471 states generated (29,332,499 s/min), 949,908,145 distinct states found (2,005,989 ds/min), 173,684,074 states left on queue. +Progress(41) at 2024-11-06 21:41:59: 10,878,207,345 states generated (29,570,874 s/min), 951,870,784 distinct states found (1,962,639 ds/min), 173,648,255 states left on queue. +Progress(41) at 2024-11-06 21:42:59: 10,907,777,091 states generated (29,569,746 s/min), 954,123,321 distinct states found (2,252,537 ds/min), 173,881,583 states left on queue. +Progress(41) at 2024-11-06 21:43:59: 10,937,383,465 states generated (29,606,374 s/min), 956,486,701 distinct states found (2,363,380 ds/min), 174,173,694 states left on queue. +Progress(41) at 2024-11-06 21:44:59: 10,967,070,713 states generated (29,687,248 s/min), 958,539,717 distinct states found (2,053,016 ds/min), 174,194,592 states left on queue. +Progress(41) at 2024-11-06 21:45:59: 10,996,524,132 states generated (29,453,419 s/min), 960,439,766 distinct states found (1,900,049 ds/min), 174,165,777 states left on queue. 
+Progress(41) at 2024-11-06 21:46:59: 11,025,919,452 states generated (29,395,320 s/min), 962,518,661 distinct states found (2,078,895 ds/min), 174,284,642 states left on queue. +Progress(41) at 2024-11-06 21:47:59: 11,055,087,136 states generated (29,167,684 s/min), 964,440,130 distinct states found (1,921,469 ds/min), 174,253,951 states left on queue. +Progress(41) at 2024-11-06 21:48:59: 11,084,346,164 states generated (29,259,028 s/min), 966,652,841 distinct states found (2,212,711 ds/min), 174,452,762 states left on queue. +Progress(41) at 2024-11-06 21:49:59: 11,113,503,996 states generated (29,157,832 s/min), 968,786,590 distinct states found (2,133,749 ds/min), 174,578,147 states left on queue. +Progress(41) at 2024-11-06 21:50:59: 11,142,862,327 states generated (29,358,331 s/min), 970,780,918 distinct states found (1,994,328 ds/min), 174,585,050 states left on queue. +Progress(41) at 2024-11-06 21:51:59: 11,171,907,560 states generated (29,045,233 s/min), 972,924,432 distinct states found (2,143,514 ds/min), 174,718,189 states left on queue. +Progress(41) at 2024-11-06 21:52:59: 11,201,055,602 states generated (29,148,042 s/min), 975,106,131 distinct states found (2,181,699 ds/min), 174,874,035 states left on queue. +Progress(41) at 2024-11-06 21:53:59: 11,230,576,268 states generated (29,520,666 s/min), 977,176,048 distinct states found (2,069,917 ds/min), 175,042,666 states left on queue. +Progress(41) at 2024-11-06 21:54:59: 11,259,928,257 states generated (29,351,989 s/min), 979,337,351 distinct states found (2,161,303 ds/min), 175,248,665 states left on queue. +Progress(41) at 2024-11-06 21:55:59: 11,289,190,366 states generated (29,262,109 s/min), 981,837,130 distinct states found (2,499,779 ds/min), 175,680,736 states left on queue. +Progress(41) at 2024-11-06 21:56:59: 11,318,399,828 states generated (29,209,462 s/min), 984,112,195 distinct states found (2,275,065 ds/min), 175,913,580 states left on queue. +Progress(41) at 2024-11-06 21:57:59: 11,347,862,845 states generated (29,463,017 s/min), 986,368,069 distinct states found (2,255,874 ds/min), 176,126,523 states left on queue. +Progress(41) at 2024-11-06 21:58:59: 11,377,318,937 states generated (29,456,092 s/min), 988,548,686 distinct states found (2,180,617 ds/min), 176,253,552 states left on queue. +Progress(41) at 2024-11-06 21:59:59: 11,406,551,913 states generated (29,232,976 s/min), 990,875,071 distinct states found (2,326,385 ds/min), 176,528,465 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 22:00:59) +Progress(41) at 2024-11-06 22:00:59: 11,436,006,666 states generated (29,454,753 s/min), 993,234,999 distinct states found (2,359,928 ds/min), 176,816,755 states left on queue. +Progress(41) at 2024-11-06 22:01:59: 11,465,207,151 states generated (29,200,485 s/min), 995,557,179 distinct states found (2,322,180 ds/min), 177,094,397 states left on queue. +Progress(41) at 2024-11-06 22:02:59: 11,494,298,575 states generated (29,091,424 s/min), 997,927,812 distinct states found (2,370,633 ds/min), 177,411,890 states left on queue. +Progress(41) at 2024-11-06 22:03:59: 11,523,576,632 states generated (29,278,057 s/min), 1,000,196,030 distinct states found (2,268,218 ds/min), 177,640,656 states left on queue. +Progress(41) at 2024-11-06 22:04:59: 11,552,734,483 states generated (29,157,851 s/min), 1,002,452,277 distinct states found (2,256,247 ds/min), 177,827,247 states left on queue. 
+Progress(41) at 2024-11-06 22:05:59: 11,582,200,298 states generated (29,465,815 s/min), 1,004,593,818 distinct states found (2,141,541 ds/min), 177,983,707 states left on queue. +Progress(41) at 2024-11-06 22:06:59: 11,611,484,149 states generated (29,283,851 s/min), 1,006,774,383 distinct states found (2,180,565 ds/min), 178,161,577 states left on queue. +Progress(41) at 2024-11-06 22:07:59: 11,640,449,232 states generated (28,965,083 s/min), 1,008,870,356 distinct states found (2,095,973 ds/min), 178,245,657 states left on queue. +Progress(41) at 2024-11-06 22:08:59: 11,669,695,402 states generated (29,246,170 s/min), 1,010,743,262 distinct states found (1,872,906 ds/min), 178,199,630 states left on queue. +Progress(41) at 2024-11-06 22:09:59: 11,698,855,657 states generated (29,160,255 s/min), 1,012,993,163 distinct states found (2,249,901 ds/min), 178,433,806 states left on queue. +Progress(41) at 2024-11-06 22:10:59: 11,727,873,536 states generated (29,017,879 s/min), 1,015,222,628 distinct states found (2,229,465 ds/min), 178,645,315 states left on queue. +Progress(41) at 2024-11-06 22:11:59: 11,756,910,696 states generated (29,037,160 s/min), 1,017,493,811 distinct states found (2,271,183 ds/min), 178,885,854 states left on queue. +Progress(41) at 2024-11-06 22:12:59: 11,785,841,957 states generated (28,931,261 s/min), 1,019,798,730 distinct states found (2,304,919 ds/min), 179,138,831 states left on queue. +Progress(41) at 2024-11-06 22:13:59: 11,814,627,351 states generated (28,785,394 s/min), 1,022,115,935 distinct states found (2,317,205 ds/min), 179,401,355 states left on queue. +Progress(41) at 2024-11-06 22:14:59: 11,843,482,288 states generated (28,854,937 s/min), 1,024,372,991 distinct states found (2,257,056 ds/min), 179,570,167 states left on queue. +Progress(41) at 2024-11-06 22:15:59: 11,872,232,503 states generated (28,750,215 s/min), 1,026,655,919 distinct states found (2,282,928 ds/min), 179,704,400 states left on queue. +Progress(41) at 2024-11-06 22:16:59: 11,901,011,327 states generated (28,778,824 s/min), 1,028,780,151 distinct states found (2,124,232 ds/min), 179,744,822 states left on queue. +Progress(41) at 2024-11-06 22:17:59: 11,930,078,061 states generated (29,066,734 s/min), 1,030,863,673 distinct states found (2,083,522 ds/min), 179,790,662 states left on queue. +Progress(41) at 2024-11-06 22:18:59: 11,959,463,901 states generated (29,385,840 s/min), 1,032,840,344 distinct states found (1,976,671 ds/min), 179,738,442 states left on queue. +Progress(41) at 2024-11-06 22:19:59: 11,988,811,132 states generated (29,347,231 s/min), 1,034,897,049 distinct states found (2,056,705 ds/min), 179,788,782 states left on queue. +Progress(41) at 2024-11-06 22:20:59: 12,018,335,911 states generated (29,524,779 s/min), 1,037,158,579 distinct states found (2,261,530 ds/min), 179,978,226 states left on queue. +Progress(41) at 2024-11-06 22:21:59: 12,047,755,593 states generated (29,419,682 s/min), 1,039,437,623 distinct states found (2,279,044 ds/min), 180,177,371 states left on queue. +Progress(41) at 2024-11-06 22:22:59: 12,077,111,001 states generated (29,355,408 s/min), 1,041,672,961 distinct states found (2,235,338 ds/min), 180,336,777 states left on queue. +Progress(41) at 2024-11-06 22:23:59: 12,106,556,177 states generated (29,445,176 s/min), 1,043,675,880 distinct states found (2,002,919 ds/min), 180,345,759 states left on queue. 
+Progress(41) at 2024-11-06 22:24:59: 12,135,797,446 states generated (29,241,269 s/min), 1,045,966,606 distinct states found (2,290,726 ds/min), 180,552,887 states left on queue. +Progress(41) at 2024-11-06 22:25:59: 12,165,143,756 states generated (29,346,310 s/min), 1,048,373,643 distinct states found (2,407,037 ds/min), 180,860,142 states left on queue. +Progress(41) at 2024-11-06 22:26:59: 12,194,478,236 states generated (29,334,480 s/min), 1,050,403,560 distinct states found (2,029,917 ds/min), 180,873,811 states left on queue. +Progress(41) at 2024-11-06 22:27:59: 12,223,653,080 states generated (29,174,844 s/min), 1,052,798,502 distinct states found (2,394,942 ds/min), 181,184,025 states left on queue. +Progress(41) at 2024-11-06 22:28:59: 12,252,926,784 states generated (29,273,704 s/min), 1,055,243,990 distinct states found (2,445,488 ds/min), 181,542,525 states left on queue. +Progress(41) at 2024-11-06 22:29:59: 12,282,176,071 states generated (29,249,287 s/min), 1,057,488,489 distinct states found (2,244,499 ds/min), 181,704,266 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 22:31:00) +Progress(41) at 2024-11-06 22:31:00: 12,311,654,529 states generated (29,478,458 s/min), 1,059,789,296 distinct states found (2,300,807 ds/min), 181,875,392 states left on queue. +Progress(41) at 2024-11-06 22:32:00: 12,340,837,903 states generated (29,183,374 s/min), 1,061,857,294 distinct states found (2,067,998 ds/min), 181,860,690 states left on queue. +Progress(41) at 2024-11-06 22:33:00: 12,369,978,352 states generated (29,140,449 s/min), 1,063,943,173 distinct states found (2,085,879 ds/min), 181,951,091 states left on queue. +Progress(41) at 2024-11-06 22:34:00: 12,398,820,660 states generated (28,842,308 s/min), 1,066,384,327 distinct states found (2,441,154 ds/min), 182,284,376 states left on queue. +Progress(41) at 2024-11-06 22:35:00: 12,427,966,245 states generated (29,145,585 s/min), 1,068,376,116 distinct states found (1,991,789 ds/min), 182,275,982 states left on queue. +Progress(41) at 2024-11-06 22:36:00: 12,457,300,671 states generated (29,334,426 s/min), 1,070,596,949 distinct states found (2,220,833 ds/min), 182,442,278 states left on queue. +Progress(41) at 2024-11-06 22:37:00: 12,486,769,483 states generated (29,468,812 s/min), 1,072,968,640 distinct states found (2,371,691 ds/min), 182,718,485 states left on queue. +Progress(41) at 2024-11-06 22:38:00: 12,516,031,360 states generated (29,261,877 s/min), 1,075,001,378 distinct states found (2,032,738 ds/min), 182,729,966 states left on queue. +Progress(41) at 2024-11-06 22:39:00: 12,545,265,331 states generated (29,233,971 s/min), 1,076,880,794 distinct states found (1,879,416 ds/min), 182,634,798 states left on queue. +Progress(41) at 2024-11-06 22:40:00: 12,574,495,559 states generated (29,230,228 s/min), 1,079,123,856 distinct states found (2,243,062 ds/min), 182,812,322 states left on queue. +Progress(41) at 2024-11-06 22:41:00: 12,603,757,387 states generated (29,261,828 s/min), 1,081,610,769 distinct states found (2,486,913 ds/min), 183,219,247 states left on queue. +Progress(41) at 2024-11-06 22:42:00: 12,632,909,026 states generated (29,151,639 s/min), 1,083,967,637 distinct states found (2,356,868 ds/min), 183,478,879 states left on queue. +Progress(41) at 2024-11-06 22:43:00: 12,662,254,981 states generated (29,345,955 s/min), 1,086,272,935 distinct states found (2,305,298 ds/min), 183,726,701 states left on queue. 
+Progress(41) at 2024-11-06 22:44:00: 12,691,400,218 states generated (29,145,237 s/min), 1,088,778,928 distinct states found (2,505,993 ds/min), 184,128,274 states left on queue. +Progress(41) at 2024-11-06 22:45:00: 12,720,528,098 states generated (29,127,880 s/min), 1,091,335,929 distinct states found (2,557,001 ds/min), 184,556,078 states left on queue. +Progress(41) at 2024-11-06 22:46:00: 12,749,701,886 states generated (29,173,788 s/min), 1,093,889,510 distinct states found (2,553,581 ds/min), 184,916,391 states left on queue. +Progress(41) at 2024-11-06 22:47:00: 12,779,153,937 states generated (29,452,051 s/min), 1,096,185,973 distinct states found (2,296,463 ds/min), 185,115,877 states left on queue. +Progress(41) at 2024-11-06 22:48:00: 12,808,440,971 states generated (29,287,034 s/min), 1,098,733,865 distinct states found (2,547,892 ds/min), 185,564,617 states left on queue. +Progress(41) at 2024-11-06 22:49:00: 12,837,695,256 states generated (29,254,285 s/min), 1,100,705,460 distinct states found (1,971,595 ds/min), 185,532,558 states left on queue. +Progress(41) at 2024-11-06 22:50:00: 12,866,801,129 states generated (29,105,873 s/min), 1,103,074,603 distinct states found (2,369,143 ds/min), 185,770,427 states left on queue. +Progress(41) at 2024-11-06 22:51:00: 12,895,682,870 states generated (28,881,741 s/min), 1,105,437,747 distinct states found (2,363,144 ds/min), 186,049,274 states left on queue. +Progress(41) at 2024-11-06 22:52:00: 12,924,655,990 states generated (28,973,120 s/min), 1,107,853,554 distinct states found (2,415,807 ds/min), 186,325,129 states left on queue. +Progress(41) at 2024-11-06 22:53:00: 12,953,616,826 states generated (28,960,836 s/min), 1,110,097,321 distinct states found (2,243,767 ds/min), 186,509,276 states left on queue. +Progress(41) at 2024-11-06 22:54:00: 12,982,711,068 states generated (29,094,242 s/min), 1,112,146,097 distinct states found (2,048,776 ds/min), 186,507,356 states left on queue. +Progress(41) at 2024-11-06 22:55:00: 13,011,962,667 states generated (29,251,599 s/min), 1,114,530,785 distinct states found (2,384,688 ds/min), 186,758,016 states left on queue. +Progress(41) at 2024-11-06 22:56:00: 13,041,163,382 states generated (29,200,715 s/min), 1,116,566,038 distinct states found (2,035,253 ds/min), 186,702,453 states left on queue. +Progress(41) at 2024-11-06 22:57:00: 13,070,416,604 states generated (29,253,222 s/min), 1,118,433,735 distinct states found (1,867,697 ds/min), 186,595,926 states left on queue. +Progress(41) at 2024-11-06 22:58:00: 13,099,393,765 states generated (28,977,161 s/min), 1,120,727,626 distinct states found (2,293,891 ds/min), 186,785,521 states left on queue. +Progress(41) at 2024-11-06 22:59:00: 13,128,309,003 states generated (28,915,238 s/min), 1,123,075,278 distinct states found (2,347,652 ds/min), 186,977,496 states left on queue. +Progress(42) at 2024-11-06 23:00:00: 13,157,492,254 states generated (29,183,251 s/min), 1,125,164,050 distinct states found (2,088,772 ds/min), 186,994,591 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 23:01:01) +Progress(42) at 2024-11-06 23:01:01: 13,187,099,442 states generated (29,607,188 s/min), 1,126,955,828 distinct states found (1,791,778 ds/min), 186,860,457 states left on queue. +Progress(42) at 2024-11-06 23:02:01: 13,216,408,249 states generated (29,308,807 s/min), 1,128,852,586 distinct states found (1,896,758 ds/min), 186,800,218 states left on queue. 
+Progress(42) at 2024-11-06 23:03:01: 13,245,736,139 states generated (29,327,890 s/min), 1,130,960,381 distinct states found (2,107,795 ds/min), 186,911,118 states left on queue. +Progress(42) at 2024-11-06 23:04:01: 13,274,893,464 states generated (29,157,325 s/min), 1,132,863,930 distinct states found (1,903,549 ds/min), 186,892,129 states left on queue. +Progress(42) at 2024-11-06 23:05:01: 13,304,183,990 states generated (29,290,526 s/min), 1,134,876,306 distinct states found (2,012,376 ds/min), 186,965,153 states left on queue. +Progress(42) at 2024-11-06 23:06:01: 13,333,457,770 states generated (29,273,780 s/min), 1,136,812,237 distinct states found (1,935,931 ds/min), 186,957,506 states left on queue. +Progress(42) at 2024-11-06 23:07:01: 13,362,984,994 states generated (29,527,224 s/min), 1,138,649,876 distinct states found (1,837,639 ds/min), 186,823,887 states left on queue. +Progress(42) at 2024-11-06 23:08:01: 13,392,550,733 states generated (29,565,739 s/min), 1,140,795,722 distinct states found (2,145,846 ds/min), 186,974,795 states left on queue. +Progress(42) at 2024-11-06 23:09:01: 13,422,111,300 states generated (29,560,567 s/min), 1,143,038,611 distinct states found (2,242,889 ds/min), 187,179,197 states left on queue. +Progress(42) at 2024-11-06 23:10:01: 13,451,822,496 states generated (29,711,196 s/min), 1,145,071,502 distinct states found (2,032,891 ds/min), 187,190,480 states left on queue. +Progress(42) at 2024-11-06 23:11:01: 13,481,293,484 states generated (29,470,988 s/min), 1,146,905,806 distinct states found (1,834,304 ds/min), 187,079,661 states left on queue. +Progress(42) at 2024-11-06 23:12:01: 13,510,659,679 states generated (29,366,195 s/min), 1,148,841,643 distinct states found (1,935,837 ds/min), 187,082,815 states left on queue. +Progress(42) at 2024-11-06 23:13:01: 13,539,730,883 states generated (29,071,204 s/min), 1,150,715,436 distinct states found (1,873,793 ds/min), 187,013,975 states left on queue. +Progress(42) at 2024-11-06 23:14:01: 13,568,973,308 states generated (29,242,425 s/min), 1,152,689,735 distinct states found (1,974,299 ds/min), 187,016,208 states left on queue. +Progress(42) at 2024-11-06 23:15:01: 13,598,106,627 states generated (29,133,319 s/min), 1,154,829,869 distinct states found (2,140,134 ds/min), 187,147,884 states left on queue. +Progress(42) at 2024-11-06 23:16:01: 13,627,319,459 states generated (29,212,832 s/min), 1,156,740,070 distinct states found (1,910,201 ds/min), 187,086,942 states left on queue. +Progress(42) at 2024-11-06 23:17:01: 13,656,462,121 states generated (29,142,662 s/min), 1,158,698,307 distinct states found (1,958,237 ds/min), 187,072,201 states left on queue. +Progress(42) at 2024-11-06 23:18:01: 13,685,545,941 states generated (29,083,820 s/min), 1,160,688,939 distinct states found (1,990,632 ds/min), 187,078,553 states left on queue. +Progress(42) at 2024-11-06 23:19:01: 13,714,652,628 states generated (29,106,687 s/min), 1,162,748,633 distinct states found (2,059,694 ds/min), 187,157,229 states left on queue. +Progress(42) at 2024-11-06 23:20:01: 13,744,105,986 states generated (29,453,358 s/min), 1,164,748,782 distinct states found (2,000,149 ds/min), 187,275,480 states left on queue. +Progress(42) at 2024-11-06 23:21:01: 13,773,414,393 states generated (29,308,407 s/min), 1,166,804,740 distinct states found (2,055,958 ds/min), 187,393,312 states left on queue. 
+Progress(42) at 2024-11-06 23:22:01: 13,802,600,069 states generated (29,185,676 s/min), 1,169,251,493 distinct states found (2,446,753 ds/min), 187,781,298 states left on queue. +Progress(42) at 2024-11-06 23:23:01: 13,831,830,649 states generated (29,230,580 s/min), 1,171,412,176 distinct states found (2,160,683 ds/min), 187,932,991 states left on queue. +Progress(42) at 2024-11-06 23:24:01: 13,861,152,221 states generated (29,321,572 s/min), 1,173,582,994 distinct states found (2,170,818 ds/min), 188,078,037 states left on queue. +Progress(42) at 2024-11-06 23:25:01: 13,890,538,756 states generated (29,386,535 s/min), 1,175,642,901 distinct states found (2,059,907 ds/min), 188,116,794 states left on queue. +Progress(42) at 2024-11-06 23:26:01: 13,919,812,820 states generated (29,274,064 s/min), 1,177,743,048 distinct states found (2,100,147 ds/min), 188,189,399 states left on queue. +Progress(42) at 2024-11-06 23:27:01: 13,948,903,585 states generated (29,090,765 s/min), 1,179,980,470 distinct states found (2,237,422 ds/min), 188,388,309 states left on queue. +Progress(42) at 2024-11-06 23:28:01: 13,978,138,385 states generated (29,234,800 s/min), 1,182,134,981 distinct states found (2,154,511 ds/min), 188,526,735 states left on queue. +Progress(42) at 2024-11-06 23:29:01: 14,007,310,151 states generated (29,171,766 s/min), 1,184,360,360 distinct states found (2,225,379 ds/min), 188,718,575 states left on queue. +Progress(42) at 2024-11-06 23:30:01: 14,036,411,110 states generated (29,100,959 s/min), 1,186,617,835 distinct states found (2,257,475 ds/min), 188,941,068 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 23:31:01) +Progress(42) at 2024-11-06 23:31:01: 14,065,894,113 states generated (29,483,003 s/min), 1,188,743,048 distinct states found (2,125,213 ds/min), 189,035,636 states left on queue. +Progress(42) at 2024-11-06 23:32:01: 14,094,909,096 states generated (29,014,983 s/min), 1,191,096,961 distinct states found (2,353,913 ds/min), 189,332,174 states left on queue. +Progress(42) at 2024-11-06 23:33:01: 14,124,212,567 states generated (29,303,471 s/min), 1,193,012,997 distinct states found (1,916,036 ds/min), 189,266,016 states left on queue. +Progress(42) at 2024-11-06 23:34:01: 14,153,428,768 states generated (29,216,201 s/min), 1,195,170,448 distinct states found (2,157,451 ds/min), 189,430,881 states left on queue. +Progress(42) at 2024-11-06 23:35:01: 14,182,568,290 states generated (29,139,522 s/min), 1,197,127,126 distinct states found (1,956,678 ds/min), 189,423,769 states left on queue. +Progress(42) at 2024-11-06 23:36:01: 14,211,602,024 states generated (29,033,734 s/min), 1,199,044,612 distinct states found (1,917,486 ds/min), 189,380,199 states left on queue. +Progress(42) at 2024-11-06 23:37:01: 14,240,593,845 states generated (28,991,821 s/min), 1,200,900,028 distinct states found (1,855,416 ds/min), 189,324,925 states left on queue. +Progress(42) at 2024-11-06 23:38:01: 14,269,687,808 states generated (29,093,963 s/min), 1,203,034,598 distinct states found (2,134,570 ds/min), 189,466,947 states left on queue. +Progress(42) at 2024-11-06 23:39:01: 14,298,626,140 states generated (28,938,332 s/min), 1,205,190,806 distinct states found (2,156,208 ds/min), 189,608,794 states left on queue. +Progress(42) at 2024-11-06 23:40:01: 14,327,587,116 states generated (28,960,976 s/min), 1,207,339,559 distinct states found (2,148,753 ds/min), 189,750,359 states left on queue. 
+Progress(42) at 2024-11-06 23:41:01: 14,356,469,494 states generated (28,882,378 s/min), 1,209,518,146 distinct states found (2,178,587 ds/min), 189,892,036 states left on queue. +Progress(42) at 2024-11-06 23:42:01: 14,385,314,696 states generated (28,845,202 s/min), 1,211,701,473 distinct states found (2,183,327 ds/min), 190,050,090 states left on queue. +Progress(42) at 2024-11-06 23:43:01: 14,414,142,550 states generated (28,827,854 s/min), 1,213,859,919 distinct states found (2,158,446 ds/min), 190,161,804 states left on queue. +Progress(42) at 2024-11-06 23:44:01: 14,442,945,644 states generated (28,803,094 s/min), 1,216,005,127 distinct states found (2,145,208 ds/min), 190,173,898 states left on queue. +Progress(42) at 2024-11-06 23:45:01: 14,471,693,798 states generated (28,748,154 s/min), 1,218,030,292 distinct states found (2,025,165 ds/min), 190,127,864 states left on queue. +Progress(42) at 2024-11-06 23:46:01: 14,500,599,025 states generated (28,905,227 s/min), 1,219,996,243 distinct states found (1,965,951 ds/min), 190,069,034 states left on queue. +Progress(42) at 2024-11-06 23:47:01: 14,529,770,118 states generated (29,171,093 s/min), 1,221,890,284 distinct states found (1,894,041 ds/min), 189,948,701 states left on queue. +Progress(42) at 2024-11-06 23:48:01: 14,559,044,399 states generated (29,274,281 s/min), 1,223,772,100 distinct states found (1,881,816 ds/min), 189,844,417 states left on queue. +Progress(42) at 2024-11-06 23:49:01: 14,588,505,088 states generated (29,460,689 s/min), 1,225,870,790 distinct states found (2,098,690 ds/min), 189,921,025 states left on queue. +Progress(42) at 2024-11-06 23:50:01: 14,618,007,797 states generated (29,502,709 s/min), 1,227,944,381 distinct states found (2,073,591 ds/min), 189,947,590 states left on queue. +Progress(42) at 2024-11-06 23:51:01: 14,647,405,532 states generated (29,397,735 s/min), 1,230,287,712 distinct states found (2,343,331 ds/min), 190,200,223 states left on queue. +Progress(42) at 2024-11-06 23:52:01: 14,676,733,478 states generated (29,327,946 s/min), 1,232,303,440 distinct states found (2,015,728 ds/min), 190,178,290 states left on queue. +Progress(42) at 2024-11-06 23:53:01: 14,706,089,483 states generated (29,356,005 s/min), 1,234,269,055 distinct states found (1,965,615 ds/min), 190,175,215 states left on queue. +Progress(42) at 2024-11-06 23:54:01: 14,735,226,809 states generated (29,137,326 s/min), 1,236,451,189 distinct states found (2,182,134 ds/min), 190,293,853 states left on queue. +Progress(42) at 2024-11-06 23:55:01: 14,764,611,146 states generated (29,384,337 s/min), 1,238,780,557 distinct states found (2,329,368 ds/min), 190,528,991 states left on queue. +Progress(42) at 2024-11-06 23:56:01: 14,793,911,038 states generated (29,299,892 s/min), 1,240,745,156 distinct states found (1,964,599 ds/min), 190,493,881 states left on queue. +Progress(42) at 2024-11-06 23:57:01: 14,823,113,635 states generated (29,202,597 s/min), 1,242,984,781 distinct states found (2,239,625 ds/min), 190,675,723 states left on queue. +Progress(42) at 2024-11-06 23:58:01: 14,852,208,056 states generated (29,094,421 s/min), 1,245,341,804 distinct states found (2,357,023 ds/min), 190,959,027 states left on queue. +Progress(42) at 2024-11-06 23:59:01: 14,881,390,523 states generated (29,182,467 s/min), 1,247,530,823 distinct states found (2,189,019 ds/min), 191,085,175 states left on queue. 
+Progress(42) at 2024-11-07 00:00:01: 14,910,709,837 states generated (29,319,314 s/min), 1,249,665,632 distinct states found (2,134,809 ds/min), 191,148,911 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 00:01:02) +Progress(42) at 2024-11-07 00:01:02: 14,940,301,722 states generated (29,591,885 s/min), 1,251,820,098 distinct states found (2,154,466 ds/min), 191,164,099 states left on queue. +Progress(42) at 2024-11-07 00:02:02: 14,969,468,946 states generated (29,167,224 s/min), 1,253,608,374 distinct states found (1,788,276 ds/min), 190,977,899 states left on queue. +Progress(42) at 2024-11-07 00:03:02: 14,998,469,861 states generated (29,000,915 s/min), 1,255,846,206 distinct states found (2,237,832 ds/min), 191,179,932 states left on queue. +Progress(42) at 2024-11-07 00:04:02: 15,027,424,344 states generated (28,954,483 s/min), 1,258,012,253 distinct states found (2,166,047 ds/min), 191,269,006 states left on queue. +Progress(42) at 2024-11-07 00:05:02: 15,056,595,053 states generated (29,170,709 s/min), 1,259,974,817 distinct states found (1,962,564 ds/min), 191,232,379 states left on queue. +Progress(42) at 2024-11-07 00:06:02: 15,085,857,792 states generated (29,262,739 s/min), 1,262,139,752 distinct states found (2,164,935 ds/min), 191,351,326 states left on queue. +Progress(42) at 2024-11-07 00:07:02: 15,115,386,019 states generated (29,528,227 s/min), 1,264,425,723 distinct states found (2,285,971 ds/min), 191,549,077 states left on queue. +Progress(42) at 2024-11-07 00:08:02: 15,144,705,784 states generated (29,319,765 s/min), 1,266,390,816 distinct states found (1,965,093 ds/min), 191,495,454 states left on queue. +Progress(42) at 2024-11-07 00:09:02: 15,173,877,454 states generated (29,171,670 s/min), 1,268,144,487 distinct states found (1,753,671 ds/min), 191,300,959 states left on queue. +Progress(42) at 2024-11-07 00:10:02: 15,203,080,845 states generated (29,203,391 s/min), 1,270,256,870 distinct states found (2,112,383 ds/min), 191,363,085 states left on queue. +Progress(42) at 2024-11-07 00:11:02: 15,232,426,418 states generated (29,345,573 s/min), 1,272,624,413 distinct states found (2,367,543 ds/min), 191,673,032 states left on queue. +Progress(42) at 2024-11-07 00:12:02: 15,261,677,209 states generated (29,250,791 s/min), 1,274,995,857 distinct states found (2,371,444 ds/min), 191,960,618 states left on queue. +Progress(42) at 2024-11-07 00:13:02: 15,290,882,314 states generated (29,205,105 s/min), 1,277,269,501 distinct states found (2,273,644 ds/min), 192,155,220 states left on queue. +Progress(42) at 2024-11-07 00:14:02: 15,320,166,816 states generated (29,284,502 s/min), 1,279,524,897 distinct states found (2,255,396 ds/min), 192,367,797 states left on queue. +Progress(42) at 2024-11-07 00:15:02: 15,349,391,017 states generated (29,224,201 s/min), 1,281,912,896 distinct states found (2,387,999 ds/min), 192,657,361 states left on queue. +Progress(42) at 2024-11-07 00:16:02: 15,378,510,873 states generated (29,119,856 s/min), 1,284,352,819 distinct states found (2,439,923 ds/min), 192,982,001 states left on queue. +Progress(42) at 2024-11-07 00:17:02: 15,407,729,690 states generated (29,218,817 s/min), 1,286,798,116 distinct states found (2,445,297 ds/min), 193,251,888 states left on queue. +Progress(42) at 2024-11-07 00:18:02: 15,437,122,682 states generated (29,392,992 s/min), 1,289,060,398 distinct states found (2,262,282 ds/min), 193,393,686 states left on queue. 
+Progress(42) at 2024-11-07 00:19:02: 15,466,437,919 states generated (29,315,237 s/min), 1,291,390,007 distinct states found (2,329,609 ds/min), 193,674,611 states left on queue. +Progress(42) at 2024-11-07 00:20:02: 15,495,795,434 states generated (29,357,515 s/min), 1,293,625,999 distinct states found (2,235,992 ds/min), 193,855,148 states left on queue. +Progress(42) at 2024-11-07 00:21:02: 15,524,856,146 states generated (29,060,712 s/min), 1,295,675,220 distinct states found (2,049,221 ds/min), 193,858,347 states left on queue. +Progress(42) at 2024-11-07 00:22:02: 15,553,951,279 states generated (29,095,133 s/min), 1,297,806,219 distinct states found (2,130,999 ds/min), 193,910,330 states left on queue. +Progress(42) at 2024-11-07 00:23:02: 15,582,781,229 states generated (28,829,950 s/min), 1,300,215,254 distinct states found (2,409,035 ds/min), 194,211,020 states left on queue. +Progress(42) at 2024-11-07 00:24:02: 15,611,889,872 states generated (29,108,643 s/min), 1,302,431,347 distinct states found (2,216,093 ds/min), 194,324,070 states left on queue. +Progress(42) at 2024-11-07 00:25:02: 15,640,778,210 states generated (28,888,338 s/min), 1,304,674,839 distinct states found (2,243,492 ds/min), 194,483,563 states left on queue. +Progress(42) at 2024-11-07 00:26:02: 15,669,830,004 states generated (29,051,794 s/min), 1,306,661,103 distinct states found (1,986,264 ds/min), 194,429,101 states left on queue. +Progress(42) at 2024-11-07 00:27:02: 15,699,049,213 states generated (29,219,209 s/min), 1,308,920,712 distinct states found (2,259,609 ds/min), 194,577,576 states left on queue. +Progress(42) at 2024-11-07 00:28:02: 15,728,283,982 states generated (29,234,769 s/min), 1,310,924,780 distinct states found (2,004,068 ds/min), 194,488,601 states left on queue. +Progress(42) at 2024-11-07 00:29:02: 15,757,507,793 states generated (29,223,811 s/min), 1,312,729,390 distinct states found (1,804,610 ds/min), 194,321,454 states left on queue. +Progress(42) at 2024-11-07 00:30:02: 15,786,513,733 states generated (29,005,940 s/min), 1,314,926,573 distinct states found (2,197,183 ds/min), 194,422,995 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 00:31:03) +Progress(42) at 2024-11-07 00:31:03: 15,815,683,048 states generated (29,169,315 s/min), 1,317,135,461 distinct states found (2,208,888 ds/min), 194,492,192 states left on queue. +Progress(42) at 2024-11-07 00:32:03: 15,844,758,678 states generated (29,075,630 s/min), 1,319,144,875 distinct states found (2,009,414 ds/min), 194,413,387 states left on queue. +Progress(42) at 2024-11-07 00:33:03: 15,873,998,157 states generated (29,239,479 s/min), 1,320,932,025 distinct states found (1,787,150 ds/min), 194,281,981 states left on queue. +Progress(42) at 2024-11-07 00:34:03: 15,903,205,479 states generated (29,207,322 s/min), 1,322,654,400 distinct states found (1,722,375 ds/min), 194,091,121 states left on queue. +Progress(42) at 2024-11-07 00:35:03: 15,932,501,264 states generated (29,295,785 s/min), 1,324,682,430 distinct states found (2,028,030 ds/min), 194,137,494 states left on queue. +Progress(42) at 2024-11-07 00:36:03: 15,961,589,919 states generated (29,088,655 s/min), 1,326,509,334 distinct states found (1,826,904 ds/min), 194,051,639 states left on queue. +Progress(42) at 2024-11-07 00:37:03: 15,990,668,327 states generated (29,078,408 s/min), 1,328,357,672 distinct states found (1,848,338 ds/min), 193,989,585 states left on queue. 
+Progress(42) at 2024-11-07 00:38:03: 16,019,782,313 states generated (29,113,986 s/min), 1,330,232,446 distinct states found (1,874,774 ds/min), 193,949,446 states left on queue. +Progress(42) at 2024-11-07 00:39:03: 16,049,252,200 states generated (29,469,887 s/min), 1,331,987,412 distinct states found (1,754,966 ds/min), 193,747,896 states left on queue. +Progress(42) at 2024-11-07 00:40:03: 16,078,692,514 states generated (29,440,314 s/min), 1,333,894,185 distinct states found (1,906,773 ds/min), 193,729,942 states left on queue. +Progress(42) at 2024-11-07 00:41:03: 16,108,160,136 states generated (29,467,622 s/min), 1,336,102,661 distinct states found (2,208,476 ds/min), 193,914,624 states left on queue. +Progress(42) at 2024-11-07 00:42:03: 16,137,813,382 states generated (29,653,246 s/min), 1,338,180,836 distinct states found (2,078,175 ds/min), 193,976,996 states left on queue. +Progress(43) at 2024-11-07 00:43:03: 16,167,357,885 states generated (29,544,503 s/min), 1,339,957,139 distinct states found (1,776,303 ds/min), 193,787,392 states left on queue. +Progress(43) at 2024-11-07 00:44:03: 16,196,650,450 states generated (29,292,565 s/min), 1,341,719,088 distinct states found (1,761,949 ds/min), 193,648,551 states left on queue. +Progress(43) at 2024-11-07 00:45:03: 16,225,735,286 states generated (29,084,836 s/min), 1,343,468,127 distinct states found (1,749,039 ds/min), 193,497,590 states left on queue. +Progress(43) at 2024-11-07 00:46:03: 16,254,805,612 states generated (29,070,326 s/min), 1,345,280,226 distinct states found (1,812,099 ds/min), 193,364,788 states left on queue. +Progress(43) at 2024-11-07 00:47:03: 16,283,933,423 states generated (29,127,811 s/min), 1,347,294,879 distinct states found (2,014,653 ds/min), 193,397,713 states left on queue. +Progress(43) at 2024-11-07 00:48:03: 16,312,911,730 states generated (28,978,307 s/min), 1,349,192,377 distinct states found (1,897,498 ds/min), 193,321,503 states left on queue. +Progress(43) at 2024-11-07 00:49:03: 16,342,115,657 states generated (29,203,927 s/min), 1,350,961,684 distinct states found (1,769,307 ds/min), 193,144,596 states left on queue. +Progress(43) at 2024-11-07 00:50:03: 16,370,988,391 states generated (28,872,734 s/min), 1,352,868,904 distinct states found (1,907,220 ds/min), 193,089,969 states left on queue. +Progress(43) at 2024-11-07 00:51:03: 16,400,089,208 states generated (29,100,817 s/min), 1,354,864,448 distinct states found (1,995,544 ds/min), 193,098,377 states left on queue. +Progress(43) at 2024-11-07 00:52:03: 16,429,331,456 states generated (29,242,248 s/min), 1,356,734,632 distinct states found (1,870,184 ds/min), 193,093,615 states left on queue. +Progress(43) at 2024-11-07 00:53:03: 16,458,648,761 states generated (29,317,305 s/min), 1,358,622,917 distinct states found (1,888,285 ds/min), 193,098,172 states left on queue. +Progress(43) at 2024-11-07 00:54:03: 16,487,874,773 states generated (29,226,012 s/min), 1,360,737,908 distinct states found (2,114,991 ds/min), 193,250,949 states left on queue. +Progress(43) at 2024-11-07 00:55:03: 16,517,101,401 states generated (29,226,628 s/min), 1,363,024,072 distinct states found (2,286,164 ds/min), 193,508,719 states left on queue. +Progress(43) at 2024-11-07 00:56:03: 16,546,231,362 states generated (29,129,961 s/min), 1,365,056,771 distinct states found (2,032,699 ds/min), 193,558,441 states left on queue. 
+Progress(43) at 2024-11-07 00:57:03: 16,575,532,837 states generated (29,301,475 s/min), 1,367,107,709 distinct states found (2,050,938 ds/min), 193,609,354 states left on queue. +Progress(43) at 2024-11-07 00:58:03: 16,604,872,137 states generated (29,339,300 s/min), 1,369,059,417 distinct states found (1,951,708 ds/min), 193,561,420 states left on queue. +Progress(43) at 2024-11-07 00:59:03: 16,634,070,732 states generated (29,198,595 s/min), 1,371,016,928 distinct states found (1,957,511 ds/min), 193,513,278 states left on queue. +Progress(43) at 2024-11-07 01:00:03: 16,663,158,113 states generated (29,087,381 s/min), 1,373,092,542 distinct states found (2,075,614 ds/min), 193,582,661 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 01:01:03) +Progress(43) at 2024-11-07 01:01:03: 16,692,576,110 states generated (29,417,997 s/min), 1,375,200,108 distinct states found (2,107,566 ds/min), 193,664,621 states left on queue. +Progress(43) at 2024-11-07 01:02:03: 16,721,716,479 states generated (29,140,369 s/min), 1,377,247,529 distinct states found (2,047,421 ds/min), 193,708,538 states left on queue. +Progress(43) at 2024-11-07 01:03:03: 16,750,779,523 states generated (29,063,044 s/min), 1,379,368,087 distinct states found (2,120,558 ds/min), 193,813,065 states left on queue. +Progress(43) at 2024-11-07 01:04:03: 16,779,794,524 states generated (29,015,001 s/min), 1,381,371,287 distinct states found (2,003,200 ds/min), 193,825,465 states left on queue. +Progress(43) at 2024-11-07 01:05:03: 16,808,907,203 states generated (29,112,679 s/min), 1,383,515,008 distinct states found (2,143,721 ds/min), 193,953,821 states left on queue. +Progress(43) at 2024-11-07 01:06:03: 16,838,029,628 states generated (29,122,425 s/min), 1,385,629,882 distinct states found (2,114,874 ds/min), 194,038,163 states left on queue. +Progress(43) at 2024-11-07 01:07:03: 16,867,418,111 states generated (29,388,483 s/min), 1,387,561,049 distinct states found (1,931,167 ds/min), 194,004,058 states left on queue. +Progress(43) at 2024-11-07 01:08:03: 16,896,555,416 states generated (29,137,305 s/min), 1,389,592,238 distinct states found (2,031,189 ds/min), 194,058,208 states left on queue. +Progress(43) at 2024-11-07 01:09:03: 16,925,642,685 states generated (29,087,269 s/min), 1,391,404,896 distinct states found (1,812,658 ds/min), 193,924,951 states left on queue. +Progress(43) at 2024-11-07 01:10:03: 16,954,638,533 states generated (28,995,848 s/min), 1,393,186,525 distinct states found (1,781,629 ds/min), 193,784,358 states left on queue. +Progress(43) at 2024-11-07 01:11:03: 16,983,710,894 states generated (29,072,361 s/min), 1,395,018,264 distinct states found (1,831,739 ds/min), 193,697,690 states left on queue. +Progress(43) at 2024-11-07 01:12:03: 17,012,741,316 states generated (29,030,422 s/min), 1,397,039,325 distinct states found (2,021,061 ds/min), 193,755,919 states left on queue. +Progress(43) at 2024-11-07 01:13:03: 17,041,674,538 states generated (28,933,222 s/min), 1,399,086,352 distinct states found (2,047,027 ds/min), 193,799,420 states left on queue. +Progress(43) at 2024-11-07 01:14:03: 17,070,653,912 states generated (28,979,374 s/min), 1,401,092,312 distinct states found (2,005,960 ds/min), 193,820,018 states left on queue. +Progress(43) at 2024-11-07 01:15:03: 17,099,536,446 states generated (28,882,534 s/min), 1,403,159,743 distinct states found (2,067,431 ds/min), 193,867,947 states left on queue. 
+Progress(43) at 2024-11-07 01:16:03: 17,128,396,670 states generated (28,860,224 s/min), 1,405,244,280 distinct states found (2,084,537 ds/min), 193,945,380 states left on queue. +Progress(43) at 2024-11-07 01:17:03: 17,157,276,177 states generated (28,879,507 s/min), 1,407,274,748 distinct states found (2,030,468 ds/min), 193,944,077 states left on queue. +Progress(43) at 2024-11-07 01:18:03: 17,186,149,639 states generated (28,873,462 s/min), 1,409,283,088 distinct states found (2,008,340 ds/min), 193,881,792 states left on queue. +Progress(43) at 2024-11-07 01:19:03: 17,214,923,206 states generated (28,773,567 s/min), 1,411,167,065 distinct states found (1,883,977 ds/min), 193,711,394 states left on queue. +Progress(43) at 2024-11-07 01:20:03: 17,243,730,245 states generated (28,807,039 s/min), 1,413,023,763 distinct states found (1,856,698 ds/min), 193,546,054 states left on queue. +Progress(43) at 2024-11-07 01:21:03: 17,272,650,525 states generated (28,920,280 s/min), 1,414,802,171 distinct states found (1,778,408 ds/min), 193,345,308 states left on queue. +Progress(43) at 2024-11-07 01:22:03: 17,301,943,589 states generated (29,293,064 s/min), 1,416,599,440 distinct states found (1,797,269 ds/min), 193,158,676 states left on queue. +Progress(43) at 2024-11-07 01:23:03: 17,331,337,313 states generated (29,393,724 s/min), 1,418,547,450 distinct states found (1,948,010 ds/min), 193,112,883 states left on queue. +Progress(43) at 2024-11-07 01:24:03: 17,360,793,100 states generated (29,455,787 s/min), 1,420,576,018 distinct states found (2,028,568 ds/min), 193,100,476 states left on queue. +Progress(43) at 2024-11-07 01:25:03: 17,390,123,392 states generated (29,330,292 s/min), 1,422,693,479 distinct states found (2,117,461 ds/min), 193,171,748 states left on queue. +Progress(43) at 2024-11-07 01:26:03: 17,419,468,515 states generated (29,345,123 s/min), 1,424,783,244 distinct states found (2,089,765 ds/min), 193,228,274 states left on queue. +Progress(43) at 2024-11-07 01:27:03: 17,448,810,016 states generated (29,341,501 s/min), 1,426,560,811 distinct states found (1,777,567 ds/min), 193,036,459 states left on queue. +Progress(43) at 2024-11-07 01:28:03: 17,478,034,472 states generated (29,224,456 s/min), 1,428,663,374 distinct states found (2,102,563 ds/min), 193,125,616 states left on queue. +Progress(43) at 2024-11-07 01:29:03: 17,507,201,835 states generated (29,167,363 s/min), 1,430,735,910 distinct states found (2,072,536 ds/min), 193,146,850 states left on queue. +Progress(43) at 2024-11-07 01:30:03: 17,536,546,498 states generated (29,344,663 s/min), 1,432,877,950 distinct states found (2,142,040 ds/min), 193,230,645 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 01:31:04) +Progress(43) at 2024-11-07 01:31:04: 17,566,061,546 states generated (29,515,048 s/min), 1,434,839,708 distinct states found (1,961,758 ds/min), 193,176,951 states left on queue. +Progress(43) at 2024-11-07 01:32:04: 17,595,015,993 states generated (28,954,447 s/min), 1,436,986,257 distinct states found (2,146,549 ds/min), 193,289,254 states left on queue. +Progress(43) at 2024-11-07 01:33:04: 17,624,137,153 states generated (29,121,160 s/min), 1,439,279,150 distinct states found (2,292,893 ds/min), 193,525,973 states left on queue. +Progress(43) at 2024-11-07 01:34:04: 17,653,328,248 states generated (29,191,095 s/min), 1,441,299,767 distinct states found (2,020,617 ds/min), 193,504,947 states left on queue. 
+Progress(43) at 2024-11-07 01:35:04: 17,682,562,562 states generated (29,234,314 s/min), 1,443,317,413 distinct states found (2,017,646 ds/min), 193,471,905 states left on queue. +Progress(43) at 2024-11-07 01:36:04: 17,711,829,397 states generated (29,266,835 s/min), 1,445,304,310 distinct states found (1,986,897 ds/min), 193,370,899 states left on queue. +Progress(43) at 2024-11-07 01:37:04: 17,740,910,347 states generated (29,080,950 s/min), 1,447,009,563 distinct states found (1,705,253 ds/min), 193,129,235 states left on queue. +Progress(43) at 2024-11-07 01:38:04: 17,769,836,321 states generated (28,925,974 s/min), 1,449,139,496 distinct states found (2,129,933 ds/min), 193,234,511 states left on queue. +Progress(43) at 2024-11-07 01:39:04: 17,798,713,067 states generated (28,876,746 s/min), 1,451,211,612 distinct states found (2,072,116 ds/min), 193,241,362 states left on queue. +Progress(43) at 2024-11-07 01:40:04: 17,827,794,691 states generated (29,081,624 s/min), 1,453,062,046 distinct states found (1,850,434 ds/min), 193,114,753 states left on queue. +Progress(43) at 2024-11-07 01:41:04: 17,856,974,014 states generated (29,179,323 s/min), 1,455,151,187 distinct states found (2,089,141 ds/min), 193,169,579 states left on queue. +Progress(43) at 2024-11-07 01:42:04: 17,886,446,666 states generated (29,472,652 s/min), 1,457,303,171 distinct states found (2,151,984 ds/min), 193,263,708 states left on queue. +Progress(43) at 2024-11-07 01:43:04: 17,915,744,840 states generated (29,298,174 s/min), 1,459,261,460 distinct states found (1,958,289 ds/min), 193,194,468 states left on queue. +Progress(43) at 2024-11-07 01:44:04: 17,944,793,057 states generated (29,048,217 s/min), 1,460,885,305 distinct states found (1,623,845 ds/min), 192,905,330 states left on queue. +Progress(43) at 2024-11-07 01:45:04: 17,973,952,967 states generated (29,159,910 s/min), 1,462,880,642 distinct states found (1,995,337 ds/min), 192,871,348 states left on queue. +Progress(43) at 2024-11-07 01:46:04: 18,003,158,344 states generated (29,205,377 s/min), 1,465,077,846 distinct states found (2,197,204 ds/min), 193,039,702 states left on queue. +Progress(43) at 2024-11-07 01:47:04: 18,032,464,087 states generated (29,305,743 s/min), 1,467,361,120 distinct states found (2,283,274 ds/min), 193,271,051 states left on queue. +Progress(43) at 2024-11-07 01:48:04: 18,061,597,682 states generated (29,133,595 s/min), 1,469,505,688 distinct states found (2,144,568 ds/min), 193,354,360 states left on queue. +Progress(43) at 2024-11-07 01:49:04: 18,090,888,515 states generated (29,290,833 s/min), 1,471,655,035 distinct states found (2,149,347 ds/min), 193,472,080 states left on queue. +Progress(43) at 2024-11-07 01:50:04: 18,119,855,749 states generated (28,967,234 s/min), 1,473,959,147 distinct states found (2,304,112 ds/min), 193,714,821 states left on queue. +Progress(43) at 2024-11-07 01:51:04: 18,149,035,954 states generated (29,180,205 s/min), 1,476,253,894 distinct states found (2,294,747 ds/min), 193,939,051 states left on queue. +Progress(43) at 2024-11-07 01:52:04: 18,178,210,402 states generated (29,174,448 s/min), 1,478,557,699 distinct states found (2,303,805 ds/min), 194,141,809 states left on queue. +Progress(43) at 2024-11-07 01:53:04: 18,207,377,534 states generated (29,167,132 s/min), 1,480,870,404 distinct states found (2,312,705 ds/min), 194,307,877 states left on queue. 
+Progress(43) at 2024-11-07 01:54:04: 18,236,577,989 states generated (29,200,455 s/min), 1,483,070,823 distinct states found (2,200,419 ds/min), 194,387,223 states left on queue. +Progress(43) at 2024-11-07 01:55:04: 18,265,859,163 states generated (29,281,174 s/min), 1,485,222,154 distinct states found (2,151,331 ds/min), 194,522,233 states left on queue. +Progress(43) at 2024-11-07 01:56:04: 18,295,148,797 states generated (29,289,634 s/min), 1,487,521,283 distinct states found (2,299,129 ds/min), 194,755,427 states left on queue. +Progress(43) at 2024-11-07 01:57:04: 18,324,289,175 states generated (29,140,378 s/min), 1,489,367,193 distinct states found (1,845,910 ds/min), 194,604,366 states left on queue. +Progress(43) at 2024-11-07 01:58:04: 18,353,385,770 states generated (29,096,595 s/min), 1,491,503,782 distinct states found (2,136,589 ds/min), 194,651,670 states left on queue. +Progress(43) at 2024-11-07 01:59:04: 18,382,277,307 states generated (28,891,537 s/min), 1,493,659,362 distinct states found (2,155,580 ds/min), 194,761,640 states left on queue. +Progress(43) at 2024-11-07 02:00:04: 18,411,146,853 states generated (28,869,546 s/min), 1,495,935,237 distinct states found (2,275,875 ds/min), 194,908,896 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 02:01:05) +Progress(43) at 2024-11-07 02:01:05: 18,440,532,837 states generated (29,385,984 s/min), 1,497,937,249 distinct states found (2,002,012 ds/min), 194,874,096 states left on queue. +Progress(43) at 2024-11-07 02:02:05: 18,469,385,511 states generated (28,852,674 s/min), 1,500,062,084 distinct states found (2,124,835 ds/min), 194,900,930 states left on queue. +Progress(43) at 2024-11-07 02:03:05: 18,498,509,160 states generated (29,123,649 s/min), 1,502,077,387 distinct states found (2,015,303 ds/min), 194,871,250 states left on queue. +Progress(43) at 2024-11-07 02:04:05: 18,527,694,520 states generated (29,185,360 s/min), 1,504,242,136 distinct states found (2,164,749 ds/min), 194,924,748 states left on queue. +Progress(43) at 2024-11-07 02:05:05: 18,556,901,350 states generated (29,206,830 s/min), 1,506,034,687 distinct states found (1,792,551 ds/min), 194,670,609 states left on queue. +Progress(43) at 2024-11-07 02:06:05: 18,586,004,706 states generated (29,103,356 s/min), 1,507,879,191 distinct states found (1,844,504 ds/min), 194,551,782 states left on queue. +Progress(43) at 2024-11-07 02:07:05: 18,614,881,319 states generated (28,876,613 s/min), 1,510,019,997 distinct states found (2,140,806 ds/min), 194,594,957 states left on queue. +Progress(43) at 2024-11-07 02:08:05: 18,643,854,322 states generated (28,973,003 s/min), 1,512,074,165 distinct states found (2,054,168 ds/min), 194,532,832 states left on queue. +Progress(43) at 2024-11-07 02:09:05: 18,672,998,550 states generated (29,144,228 s/min), 1,513,943,120 distinct states found (1,868,955 ds/min), 194,368,599 states left on queue. +Progress(43) at 2024-11-07 02:10:05: 18,702,201,308 states generated (29,202,758 s/min), 1,515,546,068 distinct states found (1,602,948 ds/min), 194,090,755 states left on queue. +Progress(43) at 2024-11-07 02:11:05: 18,731,481,011 states generated (29,279,703 s/min), 1,517,343,788 distinct states found (1,797,720 ds/min), 193,942,961 states left on queue. +Progress(43) at 2024-11-07 02:12:05: 18,760,609,986 states generated (29,128,975 s/min), 1,519,160,050 distinct states found (1,816,262 ds/min), 193,815,502 states left on queue. 
+Progress(43) at 2024-11-07 02:13:05: 18,789,628,202 states generated (29,018,216 s/min), 1,520,860,123 distinct states found (1,700,073 ds/min), 193,642,399 states left on queue. +Progress(43) at 2024-11-07 02:14:05: 18,818,770,407 states generated (29,142,205 s/min), 1,522,616,126 distinct states found (1,756,003 ds/min), 193,516,180 states left on queue. +Progress(43) at 2024-11-07 02:15:05: 18,847,943,521 states generated (29,173,114 s/min), 1,524,373,878 distinct states found (1,757,752 ds/min), 193,352,389 states left on queue. +Progress(43) at 2024-11-07 02:16:05: 18,877,338,814 states generated (29,395,293 s/min), 1,526,022,199 distinct states found (1,648,321 ds/min), 193,099,089 states left on queue. +Progress(43) at 2024-11-07 02:17:05: 18,906,854,907 states generated (29,516,093 s/min), 1,528,057,287 distinct states found (2,035,088 ds/min), 193,164,007 states left on queue. +Progress(43) at 2024-11-07 02:18:05: 18,936,272,714 states generated (29,417,807 s/min), 1,530,070,868 distinct states found (2,013,581 ds/min), 193,195,191 states left on queue. +Progress(43) at 2024-11-07 02:19:05: 18,965,845,291 states generated (29,572,577 s/min), 1,531,953,514 distinct states found (1,882,646 ds/min), 193,094,610 states left on queue. +Progress(44) at 2024-11-07 02:20:05: 18,995,225,711 states generated (29,380,420 s/min), 1,533,586,486 distinct states found (1,632,972 ds/min), 192,813,292 states left on queue. +Progress(44) at 2024-11-07 02:21:05: 19,024,424,249 states generated (29,198,538 s/min), 1,535,341,846 distinct states found (1,755,360 ds/min), 192,665,431 states left on queue. +Progress(44) at 2024-11-07 02:22:05: 19,053,319,611 states generated (28,895,362 s/min), 1,536,913,652 distinct states found (1,571,806 ds/min), 192,336,687 states left on queue. +Progress(44) at 2024-11-07 02:23:05: 19,082,456,366 states generated (29,136,755 s/min), 1,538,781,638 distinct states found (1,867,986 ds/min), 192,258,068 states left on queue. +Progress(44) at 2024-11-07 02:24:05: 19,111,445,941 states generated (28,989,575 s/min), 1,540,696,734 distinct states found (1,915,096 ds/min), 192,193,602 states left on queue. +Progress(44) at 2024-11-07 02:25:05: 19,140,498,683 states generated (29,052,742 s/min), 1,542,368,994 distinct states found (1,672,260 ds/min), 191,938,239 states left on queue. +Progress(44) at 2024-11-07 02:26:05: 19,169,386,645 states generated (28,887,962 s/min), 1,544,099,236 distinct states found (1,730,242 ds/min), 191,741,059 states left on queue. +Progress(44) at 2024-11-07 02:27:05: 19,198,354,957 states generated (28,968,312 s/min), 1,545,891,836 distinct states found (1,792,600 ds/min), 191,577,211 states left on queue. +Progress(44) at 2024-11-07 02:28:05: 19,227,551,398 states generated (29,196,441 s/min), 1,547,751,807 distinct states found (1,859,971 ds/min), 191,530,291 states left on queue. +Progress(44) at 2024-11-07 02:29:05: 19,256,905,544 states generated (29,354,146 s/min), 1,549,562,753 distinct states found (1,810,946 ds/min), 191,492,536 states left on queue. +Progress(44) at 2024-11-07 02:30:05: 19,286,043,009 states generated (29,137,465 s/min), 1,551,387,062 distinct states found (1,824,309 ds/min), 191,432,131 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 02:31:05) +Progress(44) at 2024-11-07 02:31:05: 19,315,478,636 states generated (29,435,627 s/min), 1,553,684,416 distinct states found (2,297,354 ds/min), 191,685,387 states left on queue. 
+Progress(44) at 2024-11-07 02:32:05: 19,344,574,433 states generated (29,095,797 s/min), 1,555,642,251 distinct states found (1,957,835 ds/min), 191,687,380 states left on queue. +Progress(44) at 2024-11-07 02:33:05: 19,373,560,321 states generated (28,985,888 s/min), 1,557,576,032 distinct states found (1,933,781 ds/min), 191,644,771 states left on queue. +Progress(44) at 2024-11-07 02:34:05: 19,402,882,849 states generated (29,322,528 s/min), 1,559,483,211 distinct states found (1,907,179 ds/min), 191,584,351 states left on queue. +Progress(44) at 2024-11-07 02:35:05: 19,432,084,827 states generated (29,201,978 s/min), 1,561,305,888 distinct states found (1,822,677 ds/min), 191,432,596 states left on queue. +Progress(44) at 2024-11-07 02:36:05: 19,461,112,335 states generated (29,027,508 s/min), 1,563,180,797 distinct states found (1,874,909 ds/min), 191,330,553 states left on queue. +Progress(44) at 2024-11-07 02:37:05: 19,490,043,498 states generated (28,931,163 s/min), 1,565,137,368 distinct states found (1,956,571 ds/min), 191,292,333 states left on queue. +Progress(44) at 2024-11-07 02:38:05: 19,519,153,014 states generated (29,109,516 s/min), 1,567,034,954 distinct states found (1,897,586 ds/min), 191,229,524 states left on queue. +Progress(44) at 2024-11-07 02:39:05: 19,548,204,678 states generated (29,051,664 s/min), 1,568,989,443 distinct states found (1,954,489 ds/min), 191,191,752 states left on queue. +Progress(44) at 2024-11-07 02:40:05: 19,577,227,470 states generated (29,022,792 s/min), 1,570,981,495 distinct states found (1,992,052 ds/min), 191,200,024 states left on queue. +Progress(44) at 2024-11-07 02:41:05: 19,606,172,601 states generated (28,945,131 s/min), 1,572,870,324 distinct states found (1,888,829 ds/min), 191,115,956 states left on queue. +Progress(44) at 2024-11-07 02:42:05: 19,635,167,481 states generated (28,994,880 s/min), 1,574,894,468 distinct states found (2,024,144 ds/min), 191,139,869 states left on queue. +Progress(44) at 2024-11-07 02:43:05: 19,664,339,049 states generated (29,171,568 s/min), 1,576,906,348 distinct states found (2,011,880 ds/min), 191,137,521 states left on queue. +Progress(44) at 2024-11-07 02:44:05: 19,693,639,689 states generated (29,300,640 s/min), 1,578,748,425 distinct states found (1,842,077 ds/min), 191,040,518 states left on queue. +Progress(44) at 2024-11-07 02:45:05: 19,722,704,536 states generated (29,064,847 s/min), 1,580,671,538 distinct states found (1,923,113 ds/min), 191,001,469 states left on queue. +Progress(44) at 2024-11-07 02:46:05: 19,751,627,669 states generated (28,923,133 s/min), 1,582,340,762 distinct states found (1,669,224 ds/min), 190,750,504 states left on queue. +Progress(44) at 2024-11-07 02:47:05: 19,780,532,535 states generated (28,904,866 s/min), 1,583,965,049 distinct states found (1,624,287 ds/min), 190,492,540 states left on queue. +Progress(44) at 2024-11-07 02:48:05: 19,809,548,743 states generated (29,016,208 s/min), 1,585,820,774 distinct states found (1,855,725 ds/min), 190,422,454 states left on queue. +Progress(44) at 2024-11-07 02:49:05: 19,838,541,075 states generated (28,992,332 s/min), 1,587,731,649 distinct states found (1,910,875 ds/min), 190,386,932 states left on queue. +Progress(44) at 2024-11-07 02:50:05: 19,867,458,320 states generated (28,917,245 s/min), 1,589,622,141 distinct states found (1,890,492 ds/min), 190,310,460 states left on queue. 
+Progress(44) at 2024-11-07 02:51:05: 19,896,287,158 states generated (28,828,838 s/min), 1,591,517,151 distinct states found (1,895,010 ds/min), 190,235,561 states left on queue. +Progress(44) at 2024-11-07 02:52:05: 19,925,117,820 states generated (28,830,662 s/min), 1,593,453,289 distinct states found (1,936,138 ds/min), 190,176,789 states left on queue. +Progress(44) at 2024-11-07 02:53:05: 19,953,949,651 states generated (28,831,831 s/min), 1,595,392,832 distinct states found (1,939,543 ds/min), 190,137,713 states left on queue. +Progress(44) at 2024-11-07 02:54:05: 19,982,791,590 states generated (28,841,939 s/min), 1,597,295,182 distinct states found (1,902,350 ds/min), 190,030,864 states left on queue. +Progress(44) at 2024-11-07 02:55:05: 20,011,631,796 states generated (28,840,206 s/min), 1,599,162,388 distinct states found (1,867,206 ds/min), 189,857,155 states left on queue. +Progress(44) at 2024-11-07 02:56:05: 20,040,350,017 states generated (28,718,221 s/min), 1,600,882,747 distinct states found (1,720,359 ds/min), 189,556,504 states left on queue. +Progress(44) at 2024-11-07 02:57:05: 20,069,048,267 states generated (28,698,250 s/min), 1,602,583,945 distinct states found (1,701,198 ds/min), 189,276,085 states left on queue. +Progress(44) at 2024-11-07 02:58:05: 20,098,037,079 states generated (28,988,812 s/min), 1,604,245,937 distinct states found (1,661,992 ds/min), 188,968,070 states left on queue. +Progress(44) at 2024-11-07 02:59:05: 20,127,216,730 states generated (29,179,651 s/min), 1,605,916,753 distinct states found (1,670,816 ds/min), 188,703,437 states left on queue. +Progress(44) at 2024-11-07 03:00:05: 20,156,712,917 states generated (29,496,187 s/min), 1,607,868,866 distinct states found (1,952,113 ds/min), 188,640,553 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 03:01:06) +Progress(44) at 2024-11-07 03:01:06: 20,186,396,044 states generated (29,683,127 s/min), 1,609,765,772 distinct states found (1,896,906 ds/min), 188,510,848 states left on queue. +Progress(44) at 2024-11-07 03:02:06: 20,215,754,864 states generated (29,358,820 s/min), 1,611,924,139 distinct states found (2,158,367 ds/min), 188,607,723 states left on queue. +Progress(44) at 2024-11-07 03:03:06: 20,245,041,982 states generated (29,287,118 s/min), 1,613,794,702 distinct states found (1,870,563 ds/min), 188,472,700 states left on queue. +Progress(44) at 2024-11-07 03:04:06: 20,274,294,374 states generated (29,252,392 s/min), 1,615,566,733 distinct states found (1,772,031 ds/min), 188,311,061 states left on queue. +Progress(44) at 2024-11-07 03:05:06: 20,303,317,537 states generated (29,023,163 s/min), 1,617,541,966 distinct states found (1,975,233 ds/min), 188,275,227 states left on queue. +Progress(44) at 2024-11-07 03:06:06: 20,332,555,917 states generated (29,238,380 s/min), 1,619,626,477 distinct states found (2,084,511 ds/min), 188,311,129 states left on queue. +Progress(44) at 2024-11-07 03:07:06: 20,361,814,948 states generated (29,259,031 s/min), 1,621,498,944 distinct states found (1,872,467 ds/min), 188,187,982 states left on queue. +Progress(44) at 2024-11-07 03:08:06: 20,391,066,062 states generated (29,251,114 s/min), 1,623,499,145 distinct states found (2,000,201 ds/min), 188,184,372 states left on queue. +Progress(44) at 2024-11-07 03:09:06: 20,420,013,539 states generated (28,947,477 s/min), 1,625,534,256 distinct states found (2,035,111 ds/min), 188,202,174 states left on queue. 
+Progress(44) at 2024-11-07 03:10:06: 20,449,116,787 states generated (29,103,248 s/min), 1,627,670,135 distinct states found (2,135,879 ds/min), 188,303,061 states left on queue. +Progress(44) at 2024-11-07 03:11:06: 20,478,265,224 states generated (29,148,437 s/min), 1,629,558,947 distinct states found (1,888,812 ds/min), 188,171,995 states left on queue. +Progress(44) at 2024-11-07 03:12:06: 20,507,459,785 states generated (29,194,561 s/min), 1,631,460,915 distinct states found (1,901,968 ds/min), 188,044,516 states left on queue. +Progress(44) at 2024-11-07 03:13:06: 20,536,655,025 states generated (29,195,240 s/min), 1,633,292,515 distinct states found (1,831,600 ds/min), 187,823,678 states left on queue. +Progress(44) at 2024-11-07 03:14:06: 20,565,699,198 states generated (29,044,173 s/min), 1,634,967,122 distinct states found (1,674,607 ds/min), 187,564,357 states left on queue. +Progress(44) at 2024-11-07 03:15:06: 20,594,568,781 states generated (28,869,583 s/min), 1,636,996,440 distinct states found (2,029,318 ds/min), 187,577,506 states left on queue. +Progress(44) at 2024-11-07 03:16:06: 20,623,463,526 states generated (28,894,745 s/min), 1,638,870,718 distinct states found (1,874,278 ds/min), 187,429,057 states left on queue. +Progress(44) at 2024-11-07 03:17:06: 20,652,517,975 states generated (29,054,449 s/min), 1,640,608,054 distinct states found (1,737,336 ds/min), 187,198,996 states left on queue. +Progress(44) at 2024-11-07 03:18:06: 20,681,729,377 states generated (29,211,402 s/min), 1,642,682,611 distinct states found (2,074,557 ds/min), 187,238,673 states left on queue. +Progress(44) at 2024-11-07 03:19:06: 20,711,226,363 states generated (29,496,986 s/min), 1,644,764,480 distinct states found (2,081,869 ds/min), 187,269,746 states left on queue. +Progress(44) at 2024-11-07 03:20:06: 20,740,520,876 states generated (29,294,513 s/min), 1,646,565,948 distinct states found (1,801,468 ds/min), 187,085,841 states left on queue. +Progress(44) at 2024-11-07 03:21:06: 20,769,532,066 states generated (29,011,190 s/min), 1,648,139,570 distinct states found (1,573,622 ds/min), 186,737,971 states left on queue. +Progress(44) at 2024-11-07 03:22:06: 20,798,731,555 states generated (29,199,489 s/min), 1,650,061,318 distinct states found (1,921,748 ds/min), 186,652,080 states left on queue. +Progress(44) at 2024-11-07 03:23:06: 20,827,864,871 states generated (29,133,316 s/min), 1,652,217,368 distinct states found (2,156,050 ds/min), 186,786,338 states left on queue. +Progress(44) at 2024-11-07 03:24:06: 20,857,114,542 states generated (29,249,671 s/min), 1,654,404,059 distinct states found (2,186,691 ds/min), 186,937,127 states left on queue. +Progress(44) at 2024-11-07 03:25:06: 20,886,216,235 states generated (29,101,693 s/min), 1,656,424,687 distinct states found (2,020,628 ds/min), 186,925,384 states left on queue. +Progress(44) at 2024-11-07 03:26:06: 20,915,415,138 states generated (29,198,903 s/min), 1,658,503,968 distinct states found (2,079,281 ds/min), 186,988,200 states left on queue. +Progress(44) at 2024-11-07 03:27:06: 20,944,436,117 states generated (29,020,979 s/min), 1,660,708,925 distinct states found (2,204,957 ds/min), 187,151,771 states left on queue. +Progress(44) at 2024-11-07 03:28:06: 20,973,637,986 states generated (29,201,869 s/min), 1,662,812,161 distinct states found (2,103,236 ds/min), 187,208,363 states left on queue. 
+Progress(44) at 2024-11-07 03:29:06: 21,002,664,654 states generated (29,026,668 s/min), 1,665,077,078 distinct states found (2,264,917 ds/min), 187,398,168 states left on queue. +Progress(44) at 2024-11-07 03:30:06: 21,031,900,683 states generated (29,236,029 s/min), 1,667,241,517 distinct states found (2,164,439 ds/min), 187,444,342 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 03:31:07) +Progress(44) at 2024-11-07 03:31:07: 21,061,190,967 states generated (29,290,284 s/min), 1,669,337,346 distinct states found (2,095,829 ds/min), 187,431,388 states left on queue. +Progress(44) at 2024-11-07 03:32:07: 21,090,368,622 states generated (29,177,655 s/min), 1,671,292,120 distinct states found (1,954,774 ds/min), 187,370,395 states left on queue. +Progress(44) at 2024-11-07 03:33:07: 21,119,546,588 states generated (29,177,966 s/min), 1,673,505,061 distinct states found (2,212,941 ds/min), 187,548,275 states left on queue. +Progress(44) at 2024-11-07 03:34:07: 21,148,770,544 states generated (29,223,956 s/min), 1,675,679,331 distinct states found (2,174,270 ds/min), 187,681,477 states left on queue. +Progress(44) at 2024-11-07 03:35:07: 21,177,752,842 states generated (28,982,298 s/min), 1,677,512,003 distinct states found (1,832,672 ds/min), 187,502,663 states left on queue. +Progress(44) at 2024-11-07 03:36:07: 21,206,777,989 states generated (29,025,147 s/min), 1,679,391,120 distinct states found (1,879,117 ds/min), 187,345,876 states left on queue. +Progress(44) at 2024-11-07 03:37:07: 21,235,634,562 states generated (28,856,573 s/min), 1,681,551,461 distinct states found (2,160,341 ds/min), 187,464,887 states left on queue. +Progress(44) at 2024-11-07 03:38:07: 21,264,448,690 states generated (28,814,128 s/min), 1,683,690,836 distinct states found (2,139,375 ds/min), 187,491,922 states left on queue. +Progress(44) at 2024-11-07 03:39:07: 21,293,469,454 states generated (29,020,764 s/min), 1,685,615,643 distinct states found (1,924,807 ds/min), 187,416,199 states left on queue. +Progress(44) at 2024-11-07 03:40:07: 21,322,287,082 states generated (28,817,628 s/min), 1,687,574,723 distinct states found (1,959,080 ds/min), 187,310,981 states left on queue. +Progress(44) at 2024-11-07 03:41:07: 21,351,396,680 states generated (29,109,598 s/min), 1,689,546,445 distinct states found (1,971,722 ds/min), 187,236,923 states left on queue. +Progress(44) at 2024-11-07 03:42:07: 21,380,557,165 states generated (29,160,485 s/min), 1,691,587,169 distinct states found (2,040,724 ds/min), 187,186,480 states left on queue. +Progress(44) at 2024-11-07 03:43:07: 21,409,627,333 states generated (29,070,168 s/min), 1,693,246,645 distinct states found (1,659,476 ds/min), 186,839,410 states left on queue. +Progress(44) at 2024-11-07 03:44:07: 21,438,692,500 states generated (29,065,167 s/min), 1,695,162,088 distinct states found (1,915,443 ds/min), 186,763,843 states left on queue. +Progress(44) at 2024-11-07 03:45:07: 21,467,558,980 states generated (28,866,480 s/min), 1,697,105,328 distinct states found (1,943,240 ds/min), 186,647,091 states left on queue. +Progress(44) at 2024-11-07 03:46:07: 21,496,459,596 states generated (28,900,616 s/min), 1,698,987,134 distinct states found (1,881,806 ds/min), 186,428,411 states left on queue. +Progress(44) at 2024-11-07 03:47:07: 21,525,539,564 states generated (29,079,968 s/min), 1,700,685,335 distinct states found (1,698,201 ds/min), 186,176,831 states left on queue. 
+Progress(44) at 2024-11-07 03:48:07: 21,554,716,115 states generated (29,176,551 s/min), 1,702,193,633 distinct states found (1,508,298 ds/min), 185,811,852 states left on queue. +Progress(44) at 2024-11-07 03:49:07: 21,583,930,332 states generated (29,214,217 s/min), 1,703,965,186 distinct states found (1,771,553 ds/min), 185,645,122 states left on queue. +Progress(44) at 2024-11-07 03:50:07: 21,612,870,304 states generated (28,939,972 s/min), 1,705,581,017 distinct states found (1,615,831 ds/min), 185,385,482 states left on queue. +Progress(44) at 2024-11-07 03:51:07: 21,641,828,993 states generated (28,958,689 s/min), 1,707,209,695 distinct states found (1,628,678 ds/min), 185,147,878 states left on queue. +Progress(44) at 2024-11-07 03:52:07: 21,670,879,227 states generated (29,050,234 s/min), 1,708,891,056 distinct states found (1,681,361 ds/min), 184,967,950 states left on queue. +Progress(44) at 2024-11-07 03:53:07: 21,700,175,853 states generated (29,296,626 s/min), 1,710,442,845 distinct states found (1,551,789 ds/min), 184,628,950 states left on queue. +Progress(44) at 2024-11-07 03:54:07: 21,729,661,920 states generated (29,486,067 s/min), 1,712,360,375 distinct states found (1,917,530 ds/min), 184,602,047 states left on queue. +Progress(44) at 2024-11-07 03:55:07: 21,759,015,470 states generated (29,353,550 s/min), 1,714,259,170 distinct states found (1,898,795 ds/min), 184,554,564 states left on queue. +Progress(44) at 2024-11-07 03:56:07: 21,788,534,088 states generated (29,518,618 s/min), 1,716,081,999 distinct states found (1,822,829 ds/min), 184,406,994 states left on queue. +Progress(44) at 2024-11-07 03:57:07: 21,817,875,474 states generated (29,341,386 s/min), 1,717,634,611 distinct states found (1,552,612 ds/min), 184,057,660 states left on queue. +Progress(44) at 2024-11-07 03:58:07: 21,847,006,510 states generated (29,131,036 s/min), 1,719,299,741 distinct states found (1,665,130 ds/min), 183,828,258 states left on queue. +Progress(44) at 2024-11-07 03:59:07: 21,875,869,357 states generated (28,862,847 s/min), 1,720,801,722 distinct states found (1,501,981 ds/min), 183,443,083 states left on queue. +Progress(44) at 2024-11-07 04:00:07: 21,904,922,732 states generated (29,053,375 s/min), 1,722,588,504 distinct states found (1,786,782 ds/min), 183,289,094 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 04:01:07) +Progress(44) at 2024-11-07 04:01:07: 21,933,965,695 states generated (29,042,963 s/min), 1,724,285,279 distinct states found (1,696,775 ds/min), 183,029,310 states left on queue. +Progress(44) at 2024-11-07 04:02:07: 21,962,959,341 states generated (28,993,646 s/min), 1,725,868,213 distinct states found (1,582,934 ds/min), 182,699,155 states left on queue. +Progress(44) at 2024-11-07 04:03:07: 21,991,777,816 states generated (28,818,475 s/min), 1,727,519,032 distinct states found (1,650,819 ds/min), 182,433,701 states left on queue. +Progress(44) at 2024-11-07 04:04:07: 22,020,733,433 states generated (28,955,617 s/min), 1,729,219,503 distinct states found (1,700,471 ds/min), 182,216,615 states left on queue. +Progress(44) at 2024-11-07 04:05:07: 22,049,984,634 states generated (29,251,201 s/min), 1,730,967,606 distinct states found (1,748,103 ds/min), 182,140,987 states left on queue. +Progress(44) at 2024-11-07 04:06:07: 22,079,112,674 states generated (29,128,040 s/min), 1,732,648,368 distinct states found (1,680,762 ds/min), 181,963,576 states left on queue. 
+Progress(44) at 2024-11-07 04:07:07: 22,108,329,917 states generated (29,217,243 s/min), 1,734,711,160 distinct states found (2,062,792 ds/min), 182,074,434 states left on queue. +Progress(44) at 2024-11-07 04:08:07: 22,137,402,322 states generated (29,072,405 s/min), 1,736,773,111 distinct states found (2,061,951 ds/min), 182,163,318 states left on queue. +Progress(44) at 2024-11-07 04:09:07: 22,166,402,243 states generated (28,999,921 s/min), 1,738,573,615 distinct states found (1,800,504 ds/min), 182,034,194 states left on queue. +Progress(44) at 2024-11-07 04:10:07: 22,195,545,763 states generated (29,143,520 s/min), 1,740,349,901 distinct states found (1,776,286 ds/min), 181,869,339 states left on queue. +Progress(44) at 2024-11-07 04:11:07: 22,224,766,309 states generated (29,220,546 s/min), 1,742,110,577 distinct states found (1,760,676 ds/min), 181,671,885 states left on queue. +Progress(44) at 2024-11-07 04:12:07: 22,253,807,692 states generated (29,041,383 s/min), 1,743,796,752 distinct states found (1,686,175 ds/min), 181,407,584 states left on queue. +Progress(44) at 2024-11-07 04:13:07: 22,282,790,947 states generated (28,983,255 s/min), 1,745,617,175 distinct states found (1,820,423 ds/min), 181,265,096 states left on queue. +Progress(44) at 2024-11-07 04:14:07: 22,311,840,917 states generated (29,049,970 s/min), 1,747,424,658 distinct states found (1,807,483 ds/min), 181,110,335 states left on queue. +Progress(44) at 2024-11-07 04:15:07: 22,340,851,116 states generated (29,010,199 s/min), 1,749,204,899 distinct states found (1,780,241 ds/min), 180,933,264 states left on queue. +Progress(44) at 2024-11-07 04:16:07: 22,369,820,191 states generated (28,969,075 s/min), 1,751,058,290 distinct states found (1,853,391 ds/min), 180,819,450 states left on queue. +Progress(44) at 2024-11-07 04:17:07: 22,398,637,854 states generated (28,817,663 s/min), 1,752,838,012 distinct states found (1,779,722 ds/min), 180,641,066 states left on queue. +Progress(44) at 2024-11-07 04:18:07: 22,427,736,775 states generated (29,098,921 s/min), 1,754,678,716 distinct states found (1,840,704 ds/min), 180,523,907 states left on queue. +Progress(44) at 2024-11-07 04:19:07: 22,456,749,604 states generated (29,012,829 s/min), 1,756,653,204 distinct states found (1,974,488 ds/min), 180,502,441 states left on queue. +Progress(44) at 2024-11-07 04:20:07: 22,485,995,309 states generated (29,245,705 s/min), 1,758,406,219 distinct states found (1,753,015 ds/min), 180,303,710 states left on queue. +Progress(44) at 2024-11-07 04:21:07: 22,515,059,607 states generated (29,064,298 s/min), 1,760,239,858 distinct states found (1,833,639 ds/min), 180,203,277 states left on queue. +Progress(44) at 2024-11-07 04:22:07: 22,544,007,885 states generated (28,948,278 s/min), 1,761,871,023 distinct states found (1,631,165 ds/min), 179,919,396 states left on queue. +Progress(44) at 2024-11-07 04:23:07: 22,572,858,704 states generated (28,850,819 s/min), 1,763,420,170 distinct states found (1,549,147 ds/min), 179,579,696 states left on queue. +Progress(44) at 2024-11-07 04:24:07: 22,601,850,297 states generated (28,991,593 s/min), 1,765,118,103 distinct states found (1,697,933 ds/min), 179,386,571 states left on queue. +Progress(44) at 2024-11-07 04:25:07: 22,630,832,111 states generated (28,981,814 s/min), 1,766,934,802 distinct states found (1,816,699 ds/min), 179,271,264 states left on queue. 
+Progress(44) at 2024-11-07 04:26:07: 22,659,674,047 states generated (28,841,936 s/min), 1,768,697,425 distinct states found (1,762,623 ds/min), 179,093,059 states left on queue. +Progress(44) at 2024-11-07 04:27:07: 22,688,427,580 states generated (28,753,533 s/min), 1,770,450,184 distinct states found (1,752,759 ds/min), 178,899,489 states left on queue. +Progress(44) at 2024-11-07 04:28:07: 22,717,189,869 states generated (28,762,289 s/min), 1,772,256,239 distinct states found (1,806,055 ds/min), 178,731,640 states left on queue. +Progress(44) at 2024-11-07 04:29:07: 22,746,022,343 states generated (28,832,474 s/min), 1,774,044,050 distinct states found (1,787,811 ds/min), 178,570,129 states left on queue. +Progress(44) at 2024-11-07 04:30:07: 22,774,887,995 states generated (28,865,652 s/min), 1,775,840,059 distinct states found (1,796,009 ds/min), 178,368,886 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 04:31:08) +Progress(44) at 2024-11-07 04:31:08: 22,803,877,345 states generated (28,989,350 s/min), 1,777,539,486 distinct states found (1,699,427 ds/min), 178,036,764 states left on queue. +Progress(44) at 2024-11-07 04:32:08: 22,832,344,161 states generated (28,466,816 s/min), 1,779,071,939 distinct states found (1,532,453 ds/min), 177,609,496 states left on queue. +Progress(44) at 2024-11-07 04:33:08: 22,860,965,708 states generated (28,621,547 s/min), 1,780,632,191 distinct states found (1,560,252 ds/min), 177,226,645 states left on queue. +Progress(44) at 2024-11-07 04:34:08: 22,890,116,212 states generated (29,150,504 s/min), 1,782,192,671 distinct states found (1,560,480 ds/min), 176,856,967 states left on queue. +Progress(44) at 2024-11-07 04:35:08: 22,919,394,798 states generated (29,278,586 s/min), 1,783,989,997 distinct states found (1,797,326 ds/min), 176,677,020 states left on queue. +Progress(44) at 2024-11-07 04:36:08: 22,948,717,272 states generated (29,322,474 s/min), 1,785,769,628 distinct states found (1,779,631 ds/min), 176,466,304 states left on queue. +Progress(44) at 2024-11-07 04:37:08: 22,978,008,874 states generated (29,291,602 s/min), 1,787,768,546 distinct states found (1,998,918 ds/min), 176,439,057 states left on queue. +Progress(45) at 2024-11-07 04:38:08: 23,007,259,342 states generated (29,250,468 s/min), 1,789,652,180 distinct states found (1,883,634 ds/min), 176,335,868 states left on queue. +Progress(45) at 2024-11-07 04:39:08: 23,036,414,234 states generated (29,154,892 s/min), 1,791,293,395 distinct states found (1,641,215 ds/min), 176,048,834 states left on queue. +Progress(45) at 2024-11-07 04:40:08: 23,065,467,218 states generated (29,052,984 s/min), 1,793,176,180 distinct states found (1,882,785 ds/min), 175,945,068 states left on queue. +Progress(45) at 2024-11-07 04:41:08: 23,094,601,413 states generated (29,134,195 s/min), 1,795,085,201 distinct states found (1,909,021 ds/min), 175,844,471 states left on queue. +Progress(45) at 2024-11-07 04:42:08: 23,123,835,299 states generated (29,233,886 s/min), 1,796,998,629 distinct states found (1,913,428 ds/min), 175,751,026 states left on queue. +Progress(45) at 2024-11-07 04:43:08: 23,153,014,383 states generated (29,179,084 s/min), 1,798,830,917 distinct states found (1,832,288 ds/min), 175,609,899 states left on queue. +Progress(45) at 2024-11-07 04:44:08: 23,181,848,791 states generated (28,834,408 s/min), 1,800,688,969 distinct states found (1,858,052 ds/min), 175,482,089 states left on queue. 
+Progress(45) at 2024-11-07 04:45:08: 23,210,960,242 states generated (29,111,451 s/min), 1,802,681,838 distinct states found (1,992,869 ds/min), 175,468,259 states left on queue. +Progress(45) at 2024-11-07 04:46:08: 23,239,931,898 states generated (28,971,656 s/min), 1,804,527,297 distinct states found (1,845,459 ds/min), 175,314,676 states left on queue. +Progress(45) at 2024-11-07 04:47:08: 23,269,110,236 states generated (29,178,338 s/min), 1,806,324,412 distinct states found (1,797,115 ds/min), 175,104,294 states left on queue. +Progress(45) at 2024-11-07 04:48:08: 23,298,261,893 states generated (29,151,657 s/min), 1,808,026,372 distinct states found (1,701,960 ds/min), 174,789,761 states left on queue. +Progress(45) at 2024-11-07 04:49:08: 23,327,194,301 states generated (28,932,408 s/min), 1,809,635,143 distinct states found (1,608,771 ds/min), 174,475,327 states left on queue. +Progress(45) at 2024-11-07 04:50:08: 23,356,033,807 states generated (28,839,506 s/min), 1,811,533,685 distinct states found (1,898,542 ds/min), 174,375,697 states left on queue. +Progress(45) at 2024-11-07 04:51:08: 23,384,783,950 states generated (28,750,143 s/min), 1,813,242,773 distinct states found (1,709,088 ds/min), 174,093,638 states left on queue. +Progress(45) at 2024-11-07 04:52:08: 23,413,868,078 states generated (29,084,128 s/min), 1,814,921,217 distinct states found (1,678,444 ds/min), 173,816,375 states left on queue. +Progress(45) at 2024-11-07 04:53:08: 23,443,072,326 states generated (29,204,248 s/min), 1,816,887,463 distinct states found (1,966,246 ds/min), 173,768,064 states left on queue. +Progress(45) at 2024-11-07 04:54:08: 23,472,531,302 states generated (29,458,976 s/min), 1,818,893,389 distinct states found (2,005,926 ds/min), 173,736,986 states left on queue. +Progress(45) at 2024-11-07 04:55:08: 23,501,670,169 states generated (29,138,867 s/min), 1,820,467,013 distinct states found (1,573,624 ds/min), 173,393,980 states left on queue. +Progress(45) at 2024-11-07 04:56:08: 23,530,619,816 states generated (28,949,647 s/min), 1,822,153,389 distinct states found (1,686,376 ds/min), 173,102,476 states left on queue. +Progress(45) at 2024-11-07 04:57:08: 23,559,730,839 states generated (29,111,023 s/min), 1,824,067,840 distinct states found (1,914,451 ds/min), 173,045,910 states left on queue. +Progress(45) at 2024-11-07 04:58:08: 23,588,956,543 states generated (29,225,704 s/min), 1,826,128,132 distinct states found (2,060,292 ds/min), 173,097,456 states left on queue. +Progress(45) at 2024-11-07 04:59:08: 23,617,943,385 states generated (28,986,842 s/min), 1,828,156,857 distinct states found (2,028,725 ds/min), 173,115,797 states left on queue. +Progress(45) at 2024-11-07 05:00:08: 23,647,052,247 states generated (29,108,862 s/min), 1,830,116,296 distinct states found (1,959,439 ds/min), 173,061,677 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 05:01:09) +Progress(45) at 2024-11-07 05:01:09: 23,676,540,644 states generated (29,488,397 s/min), 1,832,081,172 distinct states found (1,964,876 ds/min), 173,019,523 states left on queue. +Progress(45) at 2024-11-07 05:02:09: 23,705,447,239 states generated (28,906,595 s/min), 1,834,157,962 distinct states found (2,076,790 ds/min), 173,069,444 states left on queue. +Progress(45) at 2024-11-07 05:03:09: 23,734,590,381 states generated (29,143,142 s/min), 1,836,148,599 distinct states found (1,990,637 ds/min), 173,037,041 states left on queue. 
+Progress(45) at 2024-11-07 05:04:09: 23,763,605,229 states generated (29,014,848 s/min), 1,838,302,051 distinct states found (2,153,452 ds/min), 173,135,339 states left on queue. +Progress(45) at 2024-11-07 05:05:09: 23,792,794,847 states generated (29,189,618 s/min), 1,840,318,078 distinct states found (2,016,027 ds/min), 173,064,676 states left on queue. +Progress(45) at 2024-11-07 05:06:09: 23,821,711,411 states generated (28,916,564 s/min), 1,842,248,819 distinct states found (1,930,741 ds/min), 172,938,116 states left on queue. +Progress(45) at 2024-11-07 05:07:09: 23,850,829,522 states generated (29,118,111 s/min), 1,844,084,520 distinct states found (1,835,701 ds/min), 172,779,569 states left on queue. +Progress(45) at 2024-11-07 05:08:09: 23,880,027,055 states generated (29,197,533 s/min), 1,846,207,907 distinct states found (2,123,387 ds/min), 172,876,875 states left on queue. +Progress(45) at 2024-11-07 05:09:09: 23,909,238,654 states generated (29,211,599 s/min), 1,848,275,162 distinct states found (2,067,255 ds/min), 172,917,710 states left on queue. +Progress(45) at 2024-11-07 05:10:09: 23,938,254,527 states generated (29,015,873 s/min), 1,850,062,508 distinct states found (1,787,346 ds/min), 172,709,939 states left on queue. +Progress(45) at 2024-11-07 05:11:09: 23,967,280,908 states generated (29,026,381 s/min), 1,851,840,844 distinct states found (1,778,336 ds/min), 172,472,809 states left on queue. +Progress(45) at 2024-11-07 05:12:09: 23,996,137,153 states generated (28,856,245 s/min), 1,853,907,711 distinct states found (2,066,867 ds/min), 172,514,422 states left on queue. +Progress(45) at 2024-11-07 05:13:09: 24,025,003,271 states generated (28,866,118 s/min), 1,855,881,596 distinct states found (1,973,885 ds/min), 172,410,581 states left on queue. +Progress(45) at 2024-11-07 05:14:09: 24,053,998,968 states generated (28,995,697 s/min), 1,857,730,142 distinct states found (1,848,546 ds/min), 172,259,468 states left on queue. +Progress(45) at 2024-11-07 05:15:09: 24,082,780,775 states generated (28,781,807 s/min), 1,859,612,879 distinct states found (1,882,737 ds/min), 172,097,889 states left on queue. +Progress(45) at 2024-11-07 05:16:09: 24,111,843,462 states generated (29,062,687 s/min), 1,861,479,353 distinct states found (1,866,474 ds/min), 171,938,834 states left on queue. +Progress(45) at 2024-11-07 05:17:09: 24,140,987,153 states generated (29,143,691 s/min), 1,863,390,493 distinct states found (1,911,140 ds/min), 171,786,752 states left on queue. +Progress(45) at 2024-11-07 05:18:09: 24,170,023,897 states generated (29,036,744 s/min), 1,864,965,603 distinct states found (1,575,110 ds/min), 171,386,848 states left on queue. +Progress(45) at 2024-11-07 05:19:09: 24,198,987,772 states generated (28,963,875 s/min), 1,866,820,638 distinct states found (1,855,035 ds/min), 171,238,575 states left on queue. +Progress(45) at 2024-11-07 05:20:09: 24,227,820,740 states generated (28,832,968 s/min), 1,868,623,853 distinct states found (1,803,215 ds/min), 171,005,974 states left on queue. +Progress(45) at 2024-11-07 05:21:09: 24,256,712,636 states generated (28,891,896 s/min), 1,870,265,139 distinct states found (1,641,286 ds/min), 170,619,838 states left on queue. +Progress(45) at 2024-11-07 05:22:09: 24,285,792,587 states generated (29,079,951 s/min), 1,871,770,548 distinct states found (1,505,409 ds/min), 170,247,019 states left on queue. 
+Progress(45) at 2024-11-07 05:23:09: 24,315,021,618 states generated (29,229,031 s/min), 1,873,433,426 distinct states found (1,662,878 ds/min), 169,986,497 states left on queue. +Progress(45) at 2024-11-07 05:24:09: 24,343,972,976 states generated (28,951,358 s/min), 1,874,958,509 distinct states found (1,525,083 ds/min), 169,639,357 states left on queue. +Progress(45) at 2024-11-07 05:25:09: 24,372,818,044 states generated (28,845,068 s/min), 1,876,461,909 distinct states found (1,503,400 ds/min), 169,298,313 states left on queue. +Progress(45) at 2024-11-07 05:26:09: 24,401,879,839 states generated (29,061,795 s/min), 1,878,043,093 distinct states found (1,581,184 ds/min), 169,034,999 states left on queue. +Progress(45) at 2024-11-07 05:27:09: 24,431,117,440 states generated (29,237,601 s/min), 1,879,528,913 distinct states found (1,485,820 ds/min), 168,669,766 states left on queue. +Progress(45) at 2024-11-07 05:28:09: 24,460,565,564 states generated (29,448,124 s/min), 1,881,382,841 distinct states found (1,853,928 ds/min), 168,585,549 states left on queue. +Progress(45) at 2024-11-07 05:29:09: 24,489,842,320 states generated (29,276,756 s/min), 1,883,163,526 distinct states found (1,780,685 ds/min), 168,440,866 states left on queue. +Progress(45) at 2024-11-07 05:30:09: 24,519,309,785 states generated (29,467,465 s/min), 1,884,840,978 distinct states found (1,677,452 ds/min), 168,176,100 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 05:31:09) +Progress(45) at 2024-11-07 05:31:09: 24,548,699,426 states generated (29,389,641 s/min), 1,886,346,733 distinct states found (1,505,755 ds/min), 167,794,030 states left on queue. +Progress(45) at 2024-11-07 05:32:09: 24,577,454,860 states generated (28,755,434 s/min), 1,887,761,288 distinct states found (1,414,555 ds/min), 167,342,409 states left on queue. +Progress(45) at 2024-11-07 05:33:09: 24,606,401,929 states generated (28,947,069 s/min), 1,889,451,503 distinct states found (1,690,215 ds/min), 167,115,718 states left on queue. +Progress(45) at 2024-11-07 05:34:09: 24,635,080,181 states generated (28,678,252 s/min), 1,891,013,080 distinct states found (1,561,577 ds/min), 166,760,395 states left on queue. +Progress(45) at 2024-11-07 05:35:09: 24,663,912,233 states generated (28,832,052 s/min), 1,892,486,967 distinct states found (1,473,887 ds/min), 166,347,547 states left on queue. +Progress(45) at 2024-11-07 05:36:09: 24,692,601,003 states generated (28,688,770 s/min), 1,894,014,661 distinct states found (1,527,694 ds/min), 165,980,327 states left on queue. +Progress(45) at 2024-11-07 05:37:09: 24,721,596,280 states generated (28,995,277 s/min), 1,895,667,269 distinct states found (1,652,608 ds/min), 165,766,132 states left on queue. +Progress(45) at 2024-11-07 05:38:09: 24,750,737,270 states generated (29,140,990 s/min), 1,897,304,588 distinct states found (1,637,319 ds/min), 165,602,331 states left on queue. +Progress(45) at 2024-11-07 05:39:09: 24,779,762,621 states generated (29,025,351 s/min), 1,898,944,557 distinct states found (1,639,969 ds/min), 165,399,097 states left on queue. +Progress(45) at 2024-11-07 05:40:09: 24,808,890,636 states generated (29,128,015 s/min), 1,901,039,200 distinct states found (2,094,643 ds/min), 165,505,866 states left on queue. +Progress(45) at 2024-11-07 05:41:09: 24,837,834,330 states generated (28,943,694 s/min), 1,902,825,947 distinct states found (1,786,747 ds/min), 165,385,690 states left on queue. 
+Progress(45) at 2024-11-07 05:42:09: 24,866,749,194 states generated (28,914,864 s/min), 1,904,509,048 distinct states found (1,683,101 ds/min), 165,143,394 states left on queue. +Progress(45) at 2024-11-07 05:43:09: 24,895,891,462 states generated (29,142,268 s/min), 1,906,186,633 distinct states found (1,677,585 ds/min), 164,907,199 states left on queue. +Progress(45) at 2024-11-07 05:44:09: 24,924,929,592 states generated (29,038,130 s/min), 1,907,774,010 distinct states found (1,587,377 ds/min), 164,567,256 states left on queue. +Progress(45) at 2024-11-07 05:45:09: 24,953,854,731 states generated (28,925,139 s/min), 1,909,438,393 distinct states found (1,664,383 ds/min), 164,297,435 states left on queue. +Progress(45) at 2024-11-07 05:46:09: 24,982,773,173 states generated (28,918,442 s/min), 1,911,115,370 distinct states found (1,676,977 ds/min), 164,029,981 states left on queue. +Progress(45) at 2024-11-07 05:47:09: 25,011,681,639 states generated (28,908,466 s/min), 1,912,739,102 distinct states found (1,623,732 ds/min), 163,722,709 states left on queue. +Progress(45) at 2024-11-07 05:48:09: 25,040,624,886 states generated (28,943,247 s/min), 1,914,465,220 distinct states found (1,726,118 ds/min), 163,504,979 states left on queue. +Progress(45) at 2024-11-07 05:49:09: 25,069,369,631 states generated (28,744,745 s/min), 1,916,123,524 distinct states found (1,658,304 ds/min), 163,227,016 states left on queue. +Progress(45) at 2024-11-07 05:50:09: 25,098,381,973 states generated (29,012,342 s/min), 1,917,856,454 distinct states found (1,732,930 ds/min), 163,020,213 states left on queue. +Progress(45) at 2024-11-07 05:51:09: 25,127,432,010 states generated (29,050,037 s/min), 1,919,715,623 distinct states found (1,859,169 ds/min), 162,903,211 states left on queue. +Progress(45) at 2024-11-07 05:52:09: 25,156,554,852 states generated (29,122,842 s/min), 1,921,381,482 distinct states found (1,665,859 ds/min), 162,640,342 states left on queue. +Progress(45) at 2024-11-07 05:53:09: 25,185,439,752 states generated (28,884,900 s/min), 1,923,074,493 distinct states found (1,693,011 ds/min), 162,418,419 states left on queue. +Progress(45) at 2024-11-07 05:54:09: 25,214,250,620 states generated (28,810,868 s/min), 1,924,599,166 distinct states found (1,524,673 ds/min), 162,035,736 states left on queue. +Progress(45) at 2024-11-07 05:55:09: 25,243,065,684 states generated (28,815,064 s/min), 1,926,028,590 distinct states found (1,429,424 ds/min), 161,647,928 states left on queue. +Progress(45) at 2024-11-07 05:56:09: 25,272,074,106 states generated (29,008,422 s/min), 1,927,788,924 distinct states found (1,760,334 ds/min), 161,469,066 states left on queue. +Progress(45) at 2024-11-07 05:57:09: 25,300,916,527 states generated (28,842,421 s/min), 1,929,427,503 distinct states found (1,638,579 ds/min), 161,203,063 states left on queue. +Progress(45) at 2024-11-07 05:58:09: 25,329,617,957 states generated (28,701,430 s/min), 1,931,016,200 distinct states found (1,588,697 ds/min), 160,883,828 states left on queue. +Progress(45) at 2024-11-07 05:59:09: 25,358,305,874 states generated (28,687,917 s/min), 1,932,700,683 distinct states found (1,684,483 ds/min), 160,613,534 states left on queue. +Progress(45) at 2024-11-07 06:00:09: 25,387,060,807 states generated (28,754,933 s/min), 1,934,352,908 distinct states found (1,652,225 ds/min), 160,340,594 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 06:01:10) +Progress(45) at 2024-11-07 06:01:10: 25,416,167,383 states generated (29,106,576 s/min), 1,936,031,185 distinct states found (1,678,277 ds/min), 160,024,096 states left on queue. +Progress(45) at 2024-11-07 06:02:10: 25,444,775,068 states generated (28,607,685 s/min), 1,937,531,864 distinct states found (1,500,679 ds/min), 159,558,759 states left on queue. +Progress(45) at 2024-11-07 06:03:10: 25,473,218,014 states generated (28,442,946 s/min), 1,938,932,593 distinct states found (1,400,729 ds/min), 159,031,186 states left on queue. +Progress(45) at 2024-11-07 06:04:10: 25,502,153,601 states generated (28,935,587 s/min), 1,940,366,906 distinct states found (1,434,313 ds/min), 158,550,067 states left on queue. +Progress(45) at 2024-11-07 06:05:10: 25,531,409,924 states generated (29,256,323 s/min), 1,942,031,081 distinct states found (1,664,175 ds/min), 158,260,393 states left on queue. +Progress(45) at 2024-11-07 06:06:10: 25,560,798,500 states generated (29,388,576 s/min), 1,943,755,697 distinct states found (1,724,616 ds/min), 158,001,838 states left on queue. +Progress(45) at 2024-11-07 06:07:10: 25,590,101,236 states generated (29,302,736 s/min), 1,945,659,191 distinct states found (1,903,494 ds/min), 157,894,541 states left on queue. +Progress(45) at 2024-11-07 06:08:10: 25,619,347,006 states generated (29,245,770 s/min), 1,947,436,584 distinct states found (1,777,393 ds/min), 157,703,839 states left on queue. +Progress(45) at 2024-11-07 06:09:10: 25,648,466,795 states generated (29,119,789 s/min), 1,949,039,117 distinct states found (1,602,533 ds/min), 157,391,298 states left on queue. +Progress(45) at 2024-11-07 06:10:10: 25,677,360,883 states generated (28,894,088 s/min), 1,950,787,656 distinct states found (1,748,539 ds/min), 157,176,854 states left on queue. +Progress(45) at 2024-11-07 06:11:10: 25,706,625,655 states generated (29,264,772 s/min), 1,952,700,166 distinct states found (1,912,510 ds/min), 157,069,408 states left on queue. +Progress(46) at 2024-11-07 06:12:10: 25,735,830,172 states generated (29,204,517 s/min), 1,954,444,069 distinct states found (1,743,903 ds/min), 156,852,227 states left on queue. +Progress(46) at 2024-11-07 06:13:10: 25,764,811,792 states generated (28,981,620 s/min), 1,956,165,433 distinct states found (1,721,364 ds/min), 156,618,900 states left on queue. +Progress(46) at 2024-11-07 06:14:10: 25,793,740,486 states generated (28,928,694 s/min), 1,957,961,862 distinct states found (1,796,429 ds/min), 156,441,787 states left on queue. +Progress(46) at 2024-11-07 06:15:10: 25,822,741,831 states generated (29,001,345 s/min), 1,959,749,416 distinct states found (1,787,554 ds/min), 156,253,838 states left on queue. +Progress(46) at 2024-11-07 06:16:10: 25,851,804,688 states generated (29,062,857 s/min), 1,961,466,422 distinct states found (1,717,006 ds/min), 155,977,351 states left on queue. +Progress(46) at 2024-11-07 06:17:10: 25,880,868,584 states generated (29,063,896 s/min), 1,963,090,742 distinct states found (1,624,320 ds/min), 155,628,145 states left on queue. +Progress(46) at 2024-11-07 06:18:10: 25,909,824,307 states generated (28,955,723 s/min), 1,964,570,100 distinct states found (1,479,358 ds/min), 155,182,107 states left on queue. +Progress(46) at 2024-11-07 06:19:10: 25,938,584,425 states generated (28,760,118 s/min), 1,966,303,642 distinct states found (1,733,542 ds/min), 154,946,766 states left on queue. 
+Progress(46) at 2024-11-07 06:20:10: 25,967,304,223 states generated (28,719,798 s/min), 1,967,883,207 distinct states found (1,579,565 ds/min), 154,558,935 states left on queue. +Progress(46) at 2024-11-07 06:21:10: 25,996,402,469 states generated (29,098,246 s/min), 1,969,591,000 distinct states found (1,707,793 ds/min), 154,302,069 states left on queue. +Progress(46) at 2024-11-07 06:22:10: 26,025,623,943 states generated (29,221,474 s/min), 1,971,434,403 distinct states found (1,843,403 ds/min), 154,157,059 states left on queue. +Progress(46) at 2024-11-07 06:23:10: 26,055,038,054 states generated (29,414,111 s/min), 1,973,261,720 distinct states found (1,827,317 ds/min), 153,981,317 states left on queue. +Progress(46) at 2024-11-07 06:24:10: 26,083,986,220 states generated (28,948,166 s/min), 1,974,670,648 distinct states found (1,408,928 ds/min), 153,508,388 states left on queue. +Progress(46) at 2024-11-07 06:25:10: 26,113,067,907 states generated (29,081,687 s/min), 1,976,391,547 distinct states found (1,720,899 ds/min), 153,263,845 states left on queue. +Progress(46) at 2024-11-07 06:26:10: 26,142,186,839 states generated (29,118,932 s/min), 1,978,379,881 distinct states found (1,988,334 ds/min), 153,253,200 states left on queue. +Progress(46) at 2024-11-07 06:27:10: 26,171,338,068 states generated (29,151,229 s/min), 1,980,293,569 distinct states found (1,913,688 ds/min), 153,185,559 states left on queue. +Progress(46) at 2024-11-07 06:28:10: 26,200,319,869 states generated (28,981,801 s/min), 1,982,130,034 distinct states found (1,836,465 ds/min), 153,039,826 states left on queue. +Progress(46) at 2024-11-07 06:29:10: 26,229,451,237 states generated (29,131,368 s/min), 1,984,117,981 distinct states found (1,987,947 ds/min), 153,030,792 states left on queue. +Progress(46) at 2024-11-07 06:30:10: 26,258,476,767 states generated (29,025,530 s/min), 1,985,981,073 distinct states found (1,863,092 ds/min), 152,917,939 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 06:31:11) +Progress(46) at 2024-11-07 06:31:11: 26,287,657,848 states generated (29,181,081 s/min), 1,987,875,178 distinct states found (1,894,105 ds/min), 152,784,901 states left on queue. +Progress(46) at 2024-11-07 06:32:11: 26,316,549,803 states generated (28,891,955 s/min), 1,989,821,141 distinct states found (1,945,963 ds/min), 152,728,813 states left on queue. +Progress(46) at 2024-11-07 06:33:11: 26,345,570,902 states generated (29,021,099 s/min), 1,991,762,973 distinct states found (1,941,832 ds/min), 152,648,662 states left on queue. +Progress(46) at 2024-11-07 06:34:11: 26,374,519,051 states generated (28,948,149 s/min), 1,993,605,958 distinct states found (1,842,985 ds/min), 152,446,201 states left on queue. +Progress(46) at 2024-11-07 06:35:11: 26,403,403,284 states generated (28,884,233 s/min), 1,995,379,328 distinct states found (1,773,370 ds/min), 152,189,032 states left on queue. +Progress(46) at 2024-11-07 06:36:11: 26,432,512,518 states generated (29,109,234 s/min), 1,997,205,848 distinct states found (1,826,520 ds/min), 152,060,823 states left on queue. +Progress(46) at 2024-11-07 06:37:11: 26,461,635,963 states generated (29,123,445 s/min), 1,999,221,288 distinct states found (2,015,440 ds/min), 152,052,317 states left on queue. +Progress(46) at 2024-11-07 06:38:11: 26,490,692,408 states generated (29,056,445 s/min), 2,001,003,940 distinct states found (1,782,652 ds/min), 151,869,333 states left on queue. 
+Progress(46) at 2024-11-07 06:39:11: 26,519,611,691 states generated (28,919,283 s/min), 2,002,772,264 distinct states found (1,768,324 ds/min), 151,637,576 states left on queue. +Progress(46) at 2024-11-07 06:40:11: 26,548,405,773 states generated (28,794,082 s/min), 2,004,530,832 distinct states found (1,758,568 ds/min), 151,415,653 states left on queue. +Progress(46) at 2024-11-07 06:41:11: 26,577,168,173 states generated (28,762,400 s/min), 2,006,431,383 distinct states found (1,900,551 ds/min), 151,293,655 states left on queue. +Progress(46) at 2024-11-07 06:42:11: 26,606,013,565 states generated (28,845,392 s/min), 2,008,118,930 distinct states found (1,687,547 ds/min), 150,979,607 states left on queue. +Progress(46) at 2024-11-07 06:43:11: 26,634,840,454 states generated (28,826,889 s/min), 2,010,033,233 distinct states found (1,914,303 ds/min), 150,859,233 states left on queue. +Progress(46) at 2024-11-07 06:44:11: 26,663,791,564 states generated (28,951,110 s/min), 2,011,764,506 distinct states found (1,731,273 ds/min), 150,592,176 states left on queue. +Progress(46) at 2024-11-07 06:45:11: 26,692,845,560 states generated (29,053,996 s/min), 2,013,541,948 distinct states found (1,777,442 ds/min), 150,346,125 states left on queue. +Progress(46) at 2024-11-07 06:46:11: 26,721,838,462 states generated (28,992,902 s/min), 2,015,055,311 distinct states found (1,513,363 ds/min), 149,898,025 states left on queue. +Progress(46) at 2024-11-07 06:47:11: 26,750,784,724 states generated (28,946,262 s/min), 2,016,795,791 distinct states found (1,740,480 ds/min), 149,636,143 states left on queue. +Progress(46) at 2024-11-07 06:48:11: 26,779,537,729 states generated (28,753,005 s/min), 2,018,420,817 distinct states found (1,625,026 ds/min), 149,264,338 states left on queue. +Progress(46) at 2024-11-07 06:49:11: 26,808,414,064 states generated (28,876,335 s/min), 2,019,941,133 distinct states found (1,520,316 ds/min), 148,833,851 states left on queue. +Progress(46) at 2024-11-07 06:50:11: 26,837,552,895 states generated (29,138,831 s/min), 2,021,402,334 distinct states found (1,461,201 ds/min), 148,423,082 states left on queue. +Progress(46) at 2024-11-07 06:51:11: 26,866,488,521 states generated (28,935,626 s/min), 2,022,896,299 distinct states found (1,493,965 ds/min), 148,037,640 states left on queue. +Progress(46) at 2024-11-07 06:52:11: 26,895,259,654 states generated (28,771,133 s/min), 2,024,306,180 distinct states found (1,409,881 ds/min), 147,623,626 states left on queue. +Progress(46) at 2024-11-07 06:53:11: 26,924,324,639 states generated (29,064,985 s/min), 2,025,751,691 distinct states found (1,445,511 ds/min), 147,237,191 states left on queue. +Progress(46) at 2024-11-07 06:54:11: 26,953,575,306 states generated (29,250,667 s/min), 2,027,292,041 distinct states found (1,540,350 ds/min), 146,929,253 states left on queue. +Progress(46) at 2024-11-07 06:55:11: 26,982,863,734 states generated (29,288,428 s/min), 2,029,056,116 distinct states found (1,764,075 ds/min), 146,774,179 states left on queue. +Progress(46) at 2024-11-07 06:56:11: 27,012,217,899 states generated (29,354,165 s/min), 2,030,705,091 distinct states found (1,648,975 ds/min), 146,523,776 states left on queue. +Progress(46) at 2024-11-07 06:57:11: 27,041,431,406 states generated (29,213,507 s/min), 2,032,122,917 distinct states found (1,417,826 ds/min), 146,066,712 states left on queue. 
+Progress(46) at 2024-11-07 06:58:11: 27,070,230,233 states generated (28,798,827 s/min), 2,033,502,867 distinct states found (1,379,950 ds/min), 145,580,465 states left on queue. +Progress(46) at 2024-11-07 06:59:11: 27,099,119,410 states generated (28,889,177 s/min), 2,035,080,295 distinct states found (1,577,428 ds/min), 145,255,429 states left on queue. +Progress(46) at 2024-11-07 07:00:11: 27,127,802,546 states generated (28,683,136 s/min), 2,036,480,069 distinct states found (1,399,774 ds/min), 144,763,326 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 07:01:11) +Progress(46) at 2024-11-07 07:01:11: 27,156,729,000 states generated (28,926,454 s/min), 2,037,888,171 distinct states found (1,408,102 ds/min), 144,282,188 states left on queue. +Progress(46) at 2024-11-07 07:02:11: 27,185,673,878 states generated (28,944,878 s/min), 2,039,404,499 distinct states found (1,516,328 ds/min), 143,933,899 states left on queue. +Progress(46) at 2024-11-07 07:03:11: 27,214,800,380 states generated (29,126,502 s/min), 2,040,991,907 distinct states found (1,587,408 ds/min), 143,736,528 states left on queue. +Progress(46) at 2024-11-07 07:04:11: 27,243,805,336 states generated (29,004,956 s/min), 2,042,560,493 distinct states found (1,568,586 ds/min), 143,474,607 states left on queue. +Progress(46) at 2024-11-07 07:05:11: 27,272,912,902 states generated (29,107,566 s/min), 2,044,549,687 distinct states found (1,989,194 ds/min), 143,501,934 states left on queue. +Progress(46) at 2024-11-07 07:06:11: 27,301,850,628 states generated (28,937,726 s/min), 2,046,213,816 distinct states found (1,664,129 ds/min), 143,280,971 states left on queue. +Progress(46) at 2024-11-07 07:07:11: 27,330,744,799 states generated (28,894,171 s/min), 2,047,777,121 distinct states found (1,563,305 ds/min), 142,943,602 states left on queue. +Progress(46) at 2024-11-07 07:08:11: 27,359,855,477 states generated (29,110,678 s/min), 2,049,356,015 distinct states found (1,578,894 ds/min), 142,631,188 states left on queue. +Progress(46) at 2024-11-07 07:09:11: 27,388,745,464 states generated (28,889,987 s/min), 2,050,822,496 distinct states found (1,466,481 ds/min), 142,190,439 states left on queue. +Progress(46) at 2024-11-07 07:10:11: 27,417,576,550 states generated (28,831,086 s/min), 2,052,379,523 distinct states found (1,557,027 ds/min), 141,821,153 states left on queue. +Progress(46) at 2024-11-07 07:11:11: 27,446,546,405 states generated (28,969,855 s/min), 2,053,934,499 distinct states found (1,554,976 ds/min), 141,462,097 states left on queue. +Progress(46) at 2024-11-07 07:12:11: 27,475,398,683 states generated (28,852,278 s/min), 2,055,510,649 distinct states found (1,576,150 ds/min), 141,116,110 states left on queue. +Progress(46) at 2024-11-07 07:13:11: 27,504,113,194 states generated (28,714,511 s/min), 2,057,051,677 distinct states found (1,541,028 ds/min), 140,743,906 states left on queue. +Progress(46) at 2024-11-07 07:14:11: 27,532,983,174 states generated (28,869,980 s/min), 2,058,669,649 distinct states found (1,617,972 ds/min), 140,436,853 states left on queue. +Progress(46) at 2024-11-07 07:15:11: 27,562,088,285 states generated (29,105,111 s/min), 2,060,404,146 distinct states found (1,734,497 ds/min), 140,213,296 states left on queue. +Progress(46) at 2024-11-07 07:16:11: 27,591,079,273 states generated (28,990,988 s/min), 2,061,979,907 distinct states found (1,575,761 ds/min), 139,895,056 states left on queue. 
+Progress(46) at 2024-11-07 07:17:11: 27,619,876,413 states generated (28,797,140 s/min), 2,063,482,225 distinct states found (1,502,318 ds/min), 139,506,174 states left on queue. +Progress(46) at 2024-11-07 07:18:11: 27,648,595,649 states generated (28,719,236 s/min), 2,064,847,355 distinct states found (1,365,130 ds/min), 139,035,783 states left on queue. +Progress(46) at 2024-11-07 07:19:11: 27,677,544,192 states generated (28,948,543 s/min), 2,066,507,355 distinct states found (1,660,000 ds/min), 138,783,592 states left on queue. +Progress(46) at 2024-11-07 07:20:11: 27,706,306,461 states generated (28,762,269 s/min), 2,068,019,192 distinct states found (1,511,837 ds/min), 138,418,256 states left on queue. +Progress(46) at 2024-11-07 07:21:11: 27,734,873,733 states generated (28,567,272 s/min), 2,069,467,142 distinct states found (1,447,950 ds/min), 137,977,630 states left on queue. +Progress(46) at 2024-11-07 07:22:11: 27,763,678,204 states generated (28,804,471 s/min), 2,071,034,824 distinct states found (1,567,682 ds/min), 137,622,296 states left on queue. +Progress(46) at 2024-11-07 07:23:11: 27,792,322,332 states generated (28,644,128 s/min), 2,072,586,226 distinct states found (1,551,402 ds/min), 137,231,762 states left on queue. +Progress(46) at 2024-11-07 07:24:11: 27,821,040,127 states generated (28,717,795 s/min), 2,074,030,831 distinct states found (1,444,605 ds/min), 136,731,600 states left on queue. +Progress(46) at 2024-11-07 07:25:11: 27,849,404,654 states generated (28,364,527 s/min), 2,075,273,409 distinct states found (1,242,578 ds/min), 136,082,131 states left on queue. +Progress(46) at 2024-11-07 07:26:11: 27,878,356,417 states generated (28,951,763 s/min), 2,076,656,601 distinct states found (1,383,192 ds/min), 135,570,796 states left on queue. +Progress(46) at 2024-11-07 07:27:11: 27,907,776,802 states generated (29,420,385 s/min), 2,078,383,391 distinct states found (1,726,790 ds/min), 135,306,248 states left on queue. +Progress(46) at 2024-11-07 07:28:11: 27,937,070,294 states generated (29,293,492 s/min), 2,080,076,828 distinct states found (1,693,437 ds/min), 135,034,380 states left on queue. +Progress(46) at 2024-11-07 07:29:11: 27,966,287,907 states generated (29,217,613 s/min), 2,081,855,223 distinct states found (1,778,395 ds/min), 134,839,763 states left on queue. +Progress(46) at 2024-11-07 07:30:11: 27,995,330,759 states generated (29,042,852 s/min), 2,083,372,197 distinct states found (1,516,974 ds/min), 134,461,641 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 07:31:12) +Progress(46) at 2024-11-07 07:31:12: 28,024,387,579 states generated (29,056,820 s/min), 2,085,018,193 distinct states found (1,645,996 ds/min), 134,150,126 states left on queue. +Progress(46) at 2024-11-07 07:32:12: 28,053,564,379 states generated (29,176,800 s/min), 2,086,850,158 distinct states found (1,831,965 ds/min), 133,983,084 states left on queue. +Progress(46) at 2024-11-07 07:33:12: 28,082,556,747 states generated (28,992,368 s/min), 2,088,444,271 distinct states found (1,594,113 ds/min), 133,651,269 states left on queue. +Progress(47) at 2024-11-07 07:34:12: 28,111,323,007 states generated (28,766,260 s/min), 2,090,072,790 distinct states found (1,628,519 ds/min), 133,350,640 states left on queue. +Progress(47) at 2024-11-07 07:35:12: 28,140,191,163 states generated (28,868,156 s/min), 2,091,740,224 distinct states found (1,667,434 ds/min), 133,070,266 states left on queue. 
+Progress(47) at 2024-11-07 07:36:12: 28,169,054,601 states generated (28,863,438 s/min), 2,093,375,975 distinct states found (1,635,751 ds/min), 132,752,319 states left on queue. +Progress(47) at 2024-11-07 07:37:12: 28,197,994,162 states generated (28,939,561 s/min), 2,094,929,793 distinct states found (1,553,818 ds/min), 132,356,738 states left on queue. +Progress(47) at 2024-11-07 07:38:12: 28,226,808,491 states generated (28,814,329 s/min), 2,096,311,441 distinct states found (1,381,648 ds/min), 131,832,292 states left on queue. +Progress(47) at 2024-11-07 07:39:12: 28,255,451,016 states generated (28,642,525 s/min), 2,097,907,185 distinct states found (1,595,744 ds/min), 131,487,862 states left on queue. +Progress(47) at 2024-11-07 07:40:12: 28,284,015,286 states generated (28,564,270 s/min), 2,099,332,452 distinct states found (1,425,267 ds/min), 130,982,897 states left on queue. +Progress(47) at 2024-11-07 07:41:12: 28,313,051,806 states generated (29,036,520 s/min), 2,101,053,792 distinct states found (1,721,340 ds/min), 130,744,522 states left on queue. +Progress(47) at 2024-11-07 07:42:12: 28,342,348,160 states generated (29,296,354 s/min), 2,102,778,970 distinct states found (1,725,178 ds/min), 130,505,777 states left on queue. +Progress(47) at 2024-11-07 07:43:12: 28,371,533,935 states generated (29,185,775 s/min), 2,104,337,778 distinct states found (1,558,808 ds/min), 130,144,304 states left on queue. +Progress(47) at 2024-11-07 07:44:12: 28,400,351,066 states generated (28,817,131 s/min), 2,105,835,284 distinct states found (1,497,506 ds/min), 129,719,871 states left on queue. +Progress(47) at 2024-11-07 07:45:12: 28,429,411,463 states generated (29,060,397 s/min), 2,107,704,752 distinct states found (1,869,468 ds/min), 129,618,749 states left on queue. +Progress(47) at 2024-11-07 07:46:12: 28,458,488,093 states generated (29,076,630 s/min), 2,109,483,825 distinct states found (1,779,073 ds/min), 129,439,723 states left on queue. +Progress(47) at 2024-11-07 07:47:12: 28,487,338,391 states generated (28,850,298 s/min), 2,111,230,358 distinct states found (1,746,533 ds/min), 129,232,124 states left on queue. +Progress(47) at 2024-11-07 07:48:12: 28,516,411,931 states generated (29,073,540 s/min), 2,113,150,785 distinct states found (1,920,427 ds/min), 129,168,385 states left on queue. +Progress(47) at 2024-11-07 07:49:12: 28,545,299,037 states generated (28,887,106 s/min), 2,114,878,071 distinct states found (1,727,286 ds/min), 128,948,735 states left on queue. +Progress(47) at 2024-11-07 07:50:12: 28,574,186,091 states generated (28,887,054 s/min), 2,116,622,746 distinct states found (1,744,675 ds/min), 128,711,386 states left on queue. +Progress(47) at 2024-11-07 07:51:12: 28,603,057,442 states generated (28,871,351 s/min), 2,118,435,710 distinct states found (1,812,964 ds/min), 128,543,573 states left on queue. +Progress(47) at 2024-11-07 07:52:12: 28,632,042,720 states generated (28,985,278 s/min), 2,120,240,818 distinct states found (1,805,108 ds/min), 128,349,742 states left on queue. +Progress(47) at 2024-11-07 07:53:12: 28,660,885,097 states generated (28,842,377 s/min), 2,121,904,885 distinct states found (1,664,067 ds/min), 128,002,987 states left on queue. +Progress(47) at 2024-11-07 07:54:12: 28,689,690,902 states generated (28,805,805 s/min), 2,123,498,767 distinct states found (1,593,882 ds/min), 127,622,035 states left on queue. 
+Progress(47) at 2024-11-07 07:55:12: 28,718,827,206 states generated (29,136,304 s/min), 2,125,375,087 distinct states found (1,876,320 ds/min), 127,518,682 states left on queue. +Progress(47) at 2024-11-07 07:56:12: 28,747,988,287 states generated (29,161,081 s/min), 2,127,234,055 distinct states found (1,858,968 ds/min), 127,390,123 states left on queue. +Progress(47) at 2024-11-07 07:57:12: 28,776,918,449 states generated (28,930,162 s/min), 2,128,896,639 distinct states found (1,662,584 ds/min), 127,099,202 states left on queue. +Progress(47) at 2024-11-07 07:58:12: 28,805,826,521 states generated (28,908,072 s/min), 2,130,485,896 distinct states found (1,589,257 ds/min), 126,731,846 states left on queue. +Progress(47) at 2024-11-07 07:59:12: 28,834,550,061 states generated (28,723,540 s/min), 2,132,267,049 distinct states found (1,781,153 ds/min), 126,524,859 states left on queue. +Progress(47) at 2024-11-07 08:00:12: 28,863,218,037 states generated (28,667,976 s/min), 2,133,901,471 distinct states found (1,634,422 ds/min), 126,149,810 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 08:01:13) +Progress(47) at 2024-11-07 08:01:13: 28,892,405,277 states generated (29,187,240 s/min), 2,135,683,266 distinct states found (1,781,795 ds/min), 125,938,046 states left on queue. +Progress(47) at 2024-11-07 08:02:13: 28,921,188,007 states generated (28,782,730 s/min), 2,137,299,589 distinct states found (1,616,323 ds/min), 125,575,223 states left on queue. +Progress(47) at 2024-11-07 08:03:13: 28,950,198,581 states generated (29,010,574 s/min), 2,138,945,715 distinct states found (1,646,126 ds/min), 125,225,825 states left on queue. +Progress(47) at 2024-11-07 08:04:13: 28,979,052,322 states generated (28,853,741 s/min), 2,140,384,312 distinct states found (1,438,597 ds/min), 124,739,890 states left on queue. +Progress(47) at 2024-11-07 08:05:13: 29,007,862,556 states generated (28,810,234 s/min), 2,142,020,690 distinct states found (1,636,378 ds/min), 124,389,570 states left on queue. +Progress(47) at 2024-11-07 08:06:13: 29,036,639,997 states generated (28,777,441 s/min), 2,143,436,769 distinct states found (1,416,079 ds/min), 123,853,456 states left on queue. +Progress(47) at 2024-11-07 08:07:13: 29,065,681,489 states generated (29,041,492 s/min), 2,144,841,718 distinct states found (1,404,949 ds/min), 123,385,042 states left on queue. +Progress(47) at 2024-11-07 08:08:13: 29,094,462,032 states generated (28,780,543 s/min), 2,146,214,867 distinct states found (1,373,149 ds/min), 122,908,921 states left on queue. +Progress(47) at 2024-11-07 08:09:13: 29,123,289,758 states generated (28,827,726 s/min), 2,147,553,984 distinct states found (1,339,117 ds/min), 122,446,193 states left on queue. +Progress(47) at 2024-11-07 08:10:13: 29,152,503,386 states generated (29,213,628 s/min), 2,148,942,911 distinct states found (1,388,927 ds/min), 122,030,640 states left on queue. +Progress(47) at 2024-11-07 08:11:13: 29,181,728,737 states generated (29,225,351 s/min), 2,150,631,619 distinct states found (1,688,708 ds/min), 121,816,919 states left on queue. +Progress(47) at 2024-11-07 08:12:13: 29,211,003,478 states generated (29,274,741 s/min), 2,152,175,254 distinct states found (1,543,635 ds/min), 121,489,774 states left on queue. +Progress(47) at 2024-11-07 08:13:13: 29,240,102,268 states generated (29,098,790 s/min), 2,153,537,952 distinct states found (1,362,698 ds/min), 120,992,206 states left on queue. 
+Progress(47) at 2024-11-07 08:14:13: 29,268,843,458 states generated (28,741,190 s/min), 2,154,896,522 distinct states found (1,358,570 ds/min), 120,481,830 states left on queue. +Progress(47) at 2024-11-07 08:15:13: 29,297,458,982 states generated (28,615,524 s/min), 2,156,228,693 distinct states found (1,332,171 ds/min), 119,935,590 states left on queue. +Progress(47) at 2024-11-07 08:16:13: 29,326,133,934 states generated (28,674,952 s/min), 2,157,558,222 distinct states found (1,329,529 ds/min), 119,402,611 states left on queue. +Progress(47) at 2024-11-07 08:17:13: 29,355,133,179 states generated (28,999,245 s/min), 2,159,036,229 distinct states found (1,478,007 ds/min), 119,059,305 states left on queue. +Progress(47) at 2024-11-07 08:18:13: 29,384,094,216 states generated (28,961,037 s/min), 2,160,401,726 distinct states found (1,365,497 ds/min), 118,659,528 states left on queue. +Progress(47) at 2024-11-07 08:19:13: 29,413,210,497 states generated (29,116,281 s/min), 2,162,252,062 distinct states found (1,850,336 ds/min), 118,605,990 states left on queue. +Progress(47) at 2024-11-07 08:20:13: 29,442,123,726 states generated (28,913,229 s/min), 2,163,968,572 distinct states found (1,716,510 ds/min), 118,430,828 states left on queue. +Progress(47) at 2024-11-07 08:21:13: 29,470,933,813 states generated (28,810,087 s/min), 2,165,411,802 distinct states found (1,443,230 ds/min), 118,017,068 states left on queue. +Progress(47) at 2024-11-07 08:22:13: 29,499,968,878 states generated (29,035,065 s/min), 2,166,884,069 distinct states found (1,472,267 ds/min), 117,620,342 states left on queue. +Progress(47) at 2024-11-07 08:23:13: 29,528,752,811 states generated (28,783,933 s/min), 2,168,252,577 distinct states found (1,368,508 ds/min), 117,101,560 states left on queue. +Progress(47) at 2024-11-07 08:24:13: 29,557,568,598 states generated (28,815,787 s/min), 2,169,705,158 distinct states found (1,452,581 ds/min), 116,651,662 states left on queue. +Progress(47) at 2024-11-07 08:25:13: 29,586,373,945 states generated (28,805,347 s/min), 2,171,138,563 distinct states found (1,433,405 ds/min), 116,184,414 states left on queue. +Progress(47) at 2024-11-07 08:26:13: 29,614,983,668 states generated (28,609,723 s/min), 2,172,585,802 distinct states found (1,447,239 ds/min), 115,737,683 states left on queue. +Progress(47) at 2024-11-07 08:27:13: 29,643,800,320 states generated (28,816,652 s/min), 2,174,078,381 distinct states found (1,492,579 ds/min), 115,326,021 states left on queue. +Progress(47) at 2024-11-07 08:28:13: 29,672,907,645 states generated (29,107,325 s/min), 2,175,677,520 distinct states found (1,599,139 ds/min), 114,997,885 states left on queue. +Progress(47) at 2024-11-07 08:29:13: 29,701,705,556 states generated (28,797,911 s/min), 2,177,190,856 distinct states found (1,513,336 ds/min), 114,628,087 states left on queue. +Progress(47) at 2024-11-07 08:30:13: 29,730,412,995 states generated (28,707,439 s/min), 2,178,512,609 distinct states found (1,321,753 ds/min), 114,087,841 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 08:31:13) +Progress(47) at 2024-11-07 08:31:13: 29,759,387,383 states generated (28,974,388 s/min), 2,180,040,685 distinct states found (1,528,076 ds/min), 113,747,306 states left on queue. +Progress(47) at 2024-11-07 08:32:13: 29,788,001,065 states generated (28,613,682 s/min), 2,181,419,698 distinct states found (1,379,013 ds/min), 113,276,454 states left on queue. 
+Progress(47) at 2024-11-07 08:33:13: 29,816,483,253 states generated (28,482,188 s/min), 2,182,794,509 distinct states found (1,374,811 ds/min), 112,764,181 states left on queue. +Progress(47) at 2024-11-07 08:34:13: 29,845,032,133 states generated (28,548,880 s/min), 2,184,180,577 distinct states found (1,386,068 ds/min), 112,275,854 states left on queue. +Progress(47) at 2024-11-07 08:35:13: 29,873,704,121 states generated (28,671,988 s/min), 2,185,610,616 distinct states found (1,430,039 ds/min), 111,765,886 states left on queue. +Progress(47) at 2024-11-07 08:36:13: 29,901,983,007 states generated (28,278,886 s/min), 2,186,742,865 distinct states found (1,132,249 ds/min), 111,037,502 states left on queue. +Progress(47) at 2024-11-07 08:37:13: 29,931,128,222 states generated (29,145,215 s/min), 2,188,247,053 distinct states found (1,504,188 ds/min), 110,610,871 states left on queue. +Progress(47) at 2024-11-07 08:38:13: 29,960,291,600 states generated (29,163,378 s/min), 2,189,791,380 distinct states found (1,544,327 ds/min), 110,219,347 states left on queue. +Progress(47) at 2024-11-07 08:39:13: 29,989,426,093 states generated (29,134,493 s/min), 2,191,523,686 distinct states found (1,732,306 ds/min), 109,988,090 states left on queue. +Progress(47) at 2024-11-07 08:40:13: 30,018,419,613 states generated (28,993,520 s/min), 2,192,983,724 distinct states found (1,460,038 ds/min), 109,567,153 states left on queue. +Progress(47) at 2024-11-07 08:41:13: 30,047,169,261 states generated (28,749,648 s/min), 2,194,485,325 distinct states found (1,501,601 ds/min), 109,159,610 states left on queue. +Progress(47) at 2024-11-07 08:42:13: 30,076,320,011 states generated (29,150,750 s/min), 2,196,261,852 distinct states found (1,776,527 ds/min), 108,952,775 states left on queue. +Progress(47) at 2024-11-07 08:43:13: 30,105,246,939 states generated (28,926,928 s/min), 2,197,745,917 distinct states found (1,484,065 ds/min), 108,533,801 states left on queue. +Progress(47) at 2024-11-07 08:44:13: 30,134,017,722 states generated (28,770,783 s/min), 2,199,274,846 distinct states found (1,528,929 ds/min), 108,138,210 states left on queue. +Progress(48) at 2024-11-07 08:45:13: 30,162,850,009 states generated (28,832,287 s/min), 2,200,818,695 distinct states found (1,543,849 ds/min), 107,749,686 states left on queue. +Progress(48) at 2024-11-07 08:46:13: 30,191,763,541 states generated (28,913,532 s/min), 2,202,269,881 distinct states found (1,451,186 ds/min), 107,274,074 states left on queue. +Progress(48) at 2024-11-07 08:47:13: 30,220,450,821 states generated (28,687,280 s/min), 2,203,579,369 distinct states found (1,309,488 ds/min), 106,693,506 states left on queue. +Progress(48) at 2024-11-07 08:48:13: 30,249,109,647 states generated (28,658,826 s/min), 2,204,980,828 distinct states found (1,401,459 ds/min), 106,171,815 states left on queue. +Progress(48) at 2024-11-07 08:49:13: 30,278,004,502 states generated (28,894,855 s/min), 2,206,546,641 distinct states found (1,565,813 ds/min), 105,800,017 states left on queue. +Progress(48) at 2024-11-07 08:50:13: 30,307,176,628 states generated (29,172,126 s/min), 2,208,173,395 distinct states found (1,626,754 ds/min), 105,492,735 states left on queue. +Progress(48) at 2024-11-07 08:51:13: 30,336,267,563 states generated (29,090,935 s/min), 2,209,629,083 distinct states found (1,455,688 ds/min), 105,046,561 states left on queue. 
+Progress(48) at 2024-11-07 08:52:13: 30,365,237,300 states generated (28,969,737 s/min), 2,211,221,126 distinct states found (1,592,043 ds/min), 104,707,696 states left on queue. +Progress(48) at 2024-11-07 08:53:13: 30,394,270,909 states generated (29,033,609 s/min), 2,212,948,437 distinct states found (1,727,311 ds/min), 104,490,104 states left on queue. +Progress(48) at 2024-11-07 08:54:13: 30,423,140,115 states generated (28,869,206 s/min), 2,214,640,116 distinct states found (1,691,679 ds/min), 104,243,061 states left on queue. +Progress(48) at 2024-11-07 08:55:13: 30,452,062,605 states generated (28,922,490 s/min), 2,216,327,939 distinct states found (1,687,823 ds/min), 103,983,745 states left on queue. +Progress(48) at 2024-11-07 08:56:13: 30,481,071,056 states generated (29,008,451 s/min), 2,217,983,905 distinct states found (1,655,966 ds/min), 103,702,586 states left on queue. +Progress(48) at 2024-11-07 08:57:13: 30,509,808,031 states generated (28,736,975 s/min), 2,219,662,593 distinct states found (1,678,688 ds/min), 103,423,522 states left on queue. +Progress(48) at 2024-11-07 08:58:13: 30,538,616,862 states generated (28,808,831 s/min), 2,221,288,821 distinct states found (1,626,228 ds/min), 103,098,334 states left on queue. +Progress(48) at 2024-11-07 08:59:13: 30,567,539,949 states generated (28,923,087 s/min), 2,222,969,669 distinct states found (1,680,848 ds/min), 102,811,145 states left on queue. +Progress(48) at 2024-11-07 09:00:13: 30,596,220,572 states generated (28,680,623 s/min), 2,224,451,086 distinct states found (1,481,417 ds/min), 102,320,643 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 09:01:14) +Progress(48) at 2024-11-07 09:01:14: 30,625,254,005 states generated (29,033,433 s/min), 2,225,971,213 distinct states found (1,520,127 ds/min), 101,895,678 states left on queue. +Progress(48) at 2024-11-07 09:02:14: 30,654,316,875 states generated (29,062,870 s/min), 2,227,776,007 distinct states found (1,804,794 ds/min), 101,720,925 states left on queue. +Progress(48) at 2024-11-07 09:03:14: 30,683,368,837 states generated (29,051,962 s/min), 2,229,520,592 distinct states found (1,744,585 ds/min), 101,516,049 states left on queue. +Progress(48) at 2024-11-07 09:04:14: 30,712,221,770 states generated (28,852,933 s/min), 2,231,006,576 distinct states found (1,485,984 ds/min), 101,059,951 states left on queue. +Progress(48) at 2024-11-07 09:05:14: 30,740,916,958 states generated (28,695,188 s/min), 2,232,634,565 distinct states found (1,627,989 ds/min), 100,742,863 states left on queue. +Progress(48) at 2024-11-07 09:06:14: 30,769,477,527 states generated (28,560,569 s/min), 2,234,099,495 distinct states found (1,464,930 ds/min), 100,237,731 states left on queue. +Progress(48) at 2024-11-07 09:07:14: 30,798,306,365 states generated (28,828,838 s/min), 2,235,757,510 distinct states found (1,658,015 ds/min), 99,936,798 states left on queue. +Progress(48) at 2024-11-07 09:08:14: 30,827,145,014 states generated (28,838,649 s/min), 2,237,323,374 distinct states found (1,565,864 ds/min), 99,542,928 states left on queue. +Progress(48) at 2024-11-07 09:09:14: 30,855,967,384 states generated (28,822,370 s/min), 2,238,712,445 distinct states found (1,389,071 ds/min), 98,994,892 states left on queue. +Progress(48) at 2024-11-07 09:10:14: 30,884,757,904 states generated (28,790,520 s/min), 2,240,211,537 distinct states found (1,499,092 ds/min), 98,555,003 states left on queue. 
+Progress(48) at 2024-11-07 09:11:14: 30,913,436,301 states generated (28,678,397 s/min), 2,241,549,402 distinct states found (1,337,865 ds/min), 97,972,368 states left on queue. +Progress(48) at 2024-11-07 09:12:14: 30,942,398,628 states generated (28,962,327 s/min), 2,242,894,478 distinct states found (1,345,076 ds/min), 97,450,191 states left on queue. +Progress(48) at 2024-11-07 09:13:14: 30,971,150,912 states generated (28,752,284 s/min), 2,244,149,533 distinct states found (1,255,055 ds/min), 96,915,440 states left on queue. +Progress(48) at 2024-11-07 09:14:14: 31,000,226,695 states generated (29,075,783 s/min), 2,245,486,253 distinct states found (1,336,720 ds/min), 96,453,711 states left on queue. +Progress(48) at 2024-11-07 09:15:14: 31,029,410,660 states generated (29,183,965 s/min), 2,247,033,348 distinct states found (1,547,095 ds/min), 96,134,910 states left on queue. +Progress(48) at 2024-11-07 09:16:14: 31,058,657,395 states generated (29,246,735 s/min), 2,248,447,081 distinct states found (1,413,733 ds/min), 95,701,875 states left on queue. +Progress(48) at 2024-11-07 09:17:14: 31,087,368,874 states generated (28,711,479 s/min), 2,249,703,997 distinct states found (1,256,916 ds/min), 95,112,797 states left on queue. +Progress(48) at 2024-11-07 09:18:14: 31,115,905,907 states generated (28,537,033 s/min), 2,250,949,093 distinct states found (1,245,096 ds/min), 94,499,889 states left on queue. +Progress(48) at 2024-11-07 09:19:14: 31,144,578,992 states generated (28,673,085 s/min), 2,252,226,995 distinct states found (1,277,902 ds/min), 93,927,098 states left on queue. +Progress(48) at 2024-11-07 09:20:14: 31,173,557,966 states generated (28,978,974 s/min), 2,253,602,196 distinct states found (1,375,201 ds/min), 93,561,559 states left on queue. +Progress(48) at 2024-11-07 09:21:14: 31,202,521,307 states generated (28,963,341 s/min), 2,255,224,149 distinct states found (1,621,953 ds/min), 93,337,000 states left on queue. +Progress(48) at 2024-11-07 09:22:14: 31,231,451,884 states generated (28,930,577 s/min), 2,256,879,564 distinct states found (1,655,415 ds/min), 93,119,996 states left on queue. +Progress(48) at 2024-11-07 09:23:14: 31,260,174,245 states generated (28,722,361 s/min), 2,258,206,514 distinct states found (1,326,950 ds/min), 92,610,216 states left on queue. +Progress(48) at 2024-11-07 09:24:14: 31,289,091,475 states generated (28,917,230 s/min), 2,259,564,810 distinct states found (1,358,296 ds/min), 92,123,452 states left on queue. +Progress(48) at 2024-11-07 09:25:14: 31,317,753,943 states generated (28,662,468 s/min), 2,260,868,559 distinct states found (1,303,749 ds/min), 91,550,997 states left on queue. +Progress(48) at 2024-11-07 09:26:14: 31,346,435,672 states generated (28,681,729 s/min), 2,262,197,433 distinct states found (1,328,874 ds/min), 91,002,731 states left on queue. +Progress(48) at 2024-11-07 09:27:14: 31,375,074,275 states generated (28,638,603 s/min), 2,263,549,308 distinct states found (1,351,875 ds/min), 90,479,028 states left on queue. +Progress(48) at 2024-11-07 09:28:14: 31,403,896,903 states generated (28,822,628 s/min), 2,264,999,048 distinct states found (1,449,740 ds/min), 90,030,284 states left on queue. +Progress(48) at 2024-11-07 09:29:14: 31,432,772,052 states generated (28,875,149 s/min), 2,266,431,878 distinct states found (1,432,830 ds/min), 89,580,165 states left on queue. 
+Progress(48) at 2024-11-07 09:30:14: 31,461,382,905 states generated (28,610,853 s/min), 2,267,701,315 distinct states found (1,269,437 ds/min), 89,008,135 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 09:31:15) +Progress(48) at 2024-11-07 09:31:15: 31,490,350,002 states generated (28,967,097 s/min), 2,269,120,991 distinct states found (1,419,676 ds/min), 88,574,899 states left on queue. +Progress(48) at 2024-11-07 09:32:15: 31,518,738,286 states generated (28,388,284 s/min), 2,270,333,667 distinct states found (1,212,676 ds/min), 87,950,800 states left on queue. +Progress(48) at 2024-11-07 09:33:15: 31,547,227,429 states generated (28,489,143 s/min), 2,271,632,491 distinct states found (1,298,824 ds/min), 87,379,110 states left on queue. +Progress(48) at 2024-11-07 09:34:15: 31,575,696,846 states generated (28,469,417 s/min), 2,272,873,166 distinct states found (1,240,675 ds/min), 86,717,955 states left on queue. +Progress(48) at 2024-11-07 09:35:15: 31,604,509,248 states generated (28,812,402 s/min), 2,274,166,128 distinct states found (1,292,962 ds/min), 86,122,414 states left on queue. +Progress(48) at 2024-11-07 09:36:15: 31,633,623,894 states generated (29,114,646 s/min), 2,275,690,739 distinct states found (1,524,611 ds/min), 85,718,820 states left on queue. +Progress(48) at 2024-11-07 09:37:15: 31,662,734,164 states generated (29,110,270 s/min), 2,277,282,041 distinct states found (1,591,302 ds/min), 85,389,121 states left on queue. +Progress(48) at 2024-11-07 09:38:15: 31,691,488,753 states generated (28,754,589 s/min), 2,278,666,982 distinct states found (1,384,941 ds/min), 84,903,119 states left on queue. +Progress(48) at 2024-11-07 09:39:15: 31,720,428,706 states generated (28,939,953 s/min), 2,280,231,311 distinct states found (1,564,329 ds/min), 84,529,794 states left on queue. +Progress(48) at 2024-11-07 09:40:15: 31,749,336,886 states generated (28,908,180 s/min), 2,281,688,218 distinct states found (1,456,907 ds/min), 84,091,511 states left on queue. +Progress(48) at 2024-11-07 09:41:15: 31,778,054,342 states generated (28,717,456 s/min), 2,283,102,693 distinct states found (1,414,475 ds/min), 83,605,316 states left on queue. +Progress(49) at 2024-11-07 09:42:15: 31,806,874,604 states generated (28,820,262 s/min), 2,284,525,902 distinct states found (1,423,209 ds/min), 83,115,134 states left on queue. +Progress(49) at 2024-11-07 09:43:15: 31,835,557,645 states generated (28,683,041 s/min), 2,285,776,893 distinct states found (1,250,991 ds/min), 82,491,419 states left on queue. +Progress(49) at 2024-11-07 09:44:15: 31,864,075,450 states generated (28,517,805 s/min), 2,287,028,991 distinct states found (1,252,098 ds/min), 81,847,819 states left on queue. +Progress(49) at 2024-11-07 09:45:15: 31,892,999,186 states generated (28,923,736 s/min), 2,288,552,140 distinct states found (1,523,149 ds/min), 81,459,937 states left on queue. +Progress(49) at 2024-11-07 09:46:15: 31,922,276,996 states generated (29,277,810 s/min), 2,290,137,668 distinct states found (1,585,528 ds/min), 81,115,285 states left on queue. +Progress(49) at 2024-11-07 09:47:15: 31,951,109,751 states generated (28,832,755 s/min), 2,291,477,001 distinct states found (1,339,333 ds/min), 80,582,606 states left on queue. +Progress(49) at 2024-11-07 09:48:15: 31,980,103,122 states generated (28,993,371 s/min), 2,293,149,633 distinct states found (1,672,632 ds/min), 80,321,900 states left on queue. 
+Progress(49) at 2024-11-07 09:49:15: 32,008,927,227 states generated (28,824,105 s/min), 2,294,737,299 distinct states found (1,587,666 ds/min), 79,988,982 states left on queue. +Progress(49) at 2024-11-07 09:50:15: 32,037,912,405 states generated (28,985,178 s/min), 2,296,369,269 distinct states found (1,631,970 ds/min), 79,688,340 states left on queue. +Progress(49) at 2024-11-07 09:51:15: 32,066,650,871 states generated (28,738,466 s/min), 2,297,881,682 distinct states found (1,512,413 ds/min), 79,285,058 states left on queue. +Progress(49) at 2024-11-07 09:52:15: 32,095,474,869 states generated (28,823,998 s/min), 2,299,386,856 distinct states found (1,505,174 ds/min), 78,860,285 states left on queue. +Progress(49) at 2024-11-07 09:53:15: 32,124,254,306 states generated (28,779,437 s/min), 2,300,974,245 distinct states found (1,587,389 ds/min), 78,501,509 states left on queue. +Progress(49) at 2024-11-07 09:54:15: 32,152,874,934 states generated (28,620,628 s/min), 2,302,313,494 distinct states found (1,339,249 ds/min), 77,908,264 states left on queue. +Progress(49) at 2024-11-07 09:55:15: 32,181,625,656 states generated (28,750,722 s/min), 2,303,719,911 distinct states found (1,406,417 ds/min), 77,409,147 states left on queue. +Progress(49) at 2024-11-07 09:56:15: 32,210,690,682 states generated (29,065,026 s/min), 2,305,458,559 distinct states found (1,738,648 ds/min), 77,178,015 states left on queue. +Progress(49) at 2024-11-07 09:57:15: 32,239,586,160 states generated (28,895,478 s/min), 2,307,003,156 distinct states found (1,544,597 ds/min), 76,805,818 states left on queue. +Progress(49) at 2024-11-07 09:58:15: 32,268,327,819 states generated (28,741,659 s/min), 2,308,436,891 distinct states found (1,433,735 ds/min), 76,324,212 states left on queue. +Progress(49) at 2024-11-07 09:59:15: 32,296,829,379 states generated (28,501,560 s/min), 2,309,831,948 distinct states found (1,395,057 ds/min), 75,779,735 states left on queue. +Progress(49) at 2024-11-07 10:00:15: 32,325,628,397 states generated (28,799,018 s/min), 2,311,380,882 distinct states found (1,548,934 ds/min), 75,395,162 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 10:01:15) +Progress(49) at 2024-11-07 10:01:15: 32,354,681,149 states generated (29,052,752 s/min), 2,312,867,979 distinct states found (1,487,097 ds/min), 74,928,503 states left on queue. +Progress(49) at 2024-11-07 10:02:15: 32,383,406,034 states generated (28,724,885 s/min), 2,314,202,265 distinct states found (1,334,286 ds/min), 74,352,680 states left on queue. +Progress(49) at 2024-11-07 10:03:15: 32,411,997,317 states generated (28,591,283 s/min), 2,315,435,082 distinct states found (1,232,817 ds/min), 73,700,708 states left on queue. +Progress(49) at 2024-11-07 10:04:15: 32,440,769,297 states generated (28,771,980 s/min), 2,316,687,791 distinct states found (1,252,709 ds/min), 73,114,003 states left on queue. +Progress(49) at 2024-11-07 10:05:15: 32,469,733,062 states generated (28,963,765 s/min), 2,317,885,762 distinct states found (1,197,971 ds/min), 72,558,372 states left on queue. +Progress(49) at 2024-11-07 10:06:15: 32,498,863,740 states generated (29,130,678 s/min), 2,319,353,511 distinct states found (1,467,749 ds/min), 72,186,248 states left on queue. +Progress(49) at 2024-11-07 10:07:15: 32,527,902,407 states generated (29,038,667 s/min), 2,320,635,445 distinct states found (1,281,934 ds/min), 71,639,893 states left on queue. 
+Progress(49) at 2024-11-07 10:08:15: 32,556,361,400 states generated (28,458,993 s/min), 2,321,793,726 distinct states found (1,158,281 ds/min), 70,954,333 states left on queue. +Progress(49) at 2024-11-07 10:09:15: 32,585,056,251 states generated (28,694,851 s/min), 2,323,009,155 distinct states found (1,215,429 ds/min), 70,362,671 states left on queue. +Progress(49) at 2024-11-07 10:10:15: 32,613,972,815 states generated (28,916,564 s/min), 2,324,321,084 distinct states found (1,311,929 ds/min), 69,935,186 states left on queue. +Progress(49) at 2024-11-07 10:11:15: 32,642,963,038 states generated (28,990,223 s/min), 2,325,997,874 distinct states found (1,676,790 ds/min), 69,730,871 states left on queue. +Progress(49) at 2024-11-07 10:12:15: 32,671,642,762 states generated (28,679,724 s/min), 2,327,294,217 distinct states found (1,296,343 ds/min), 69,221,413 states left on queue. +Progress(49) at 2024-11-07 10:13:15: 32,700,429,296 states generated (28,786,534 s/min), 2,328,535,742 distinct states found (1,241,525 ds/min), 68,635,066 states left on queue. +Progress(49) at 2024-11-07 10:14:15: 32,729,076,182 states generated (28,646,886 s/min), 2,329,760,071 distinct states found (1,224,329 ds/min), 67,997,735 states left on queue. +Progress(49) at 2024-11-07 10:15:15: 32,757,631,787 states generated (28,555,605 s/min), 2,331,002,517 distinct states found (1,242,446 ds/min), 67,379,374 states left on queue. +Progress(49) at 2024-11-07 10:16:15: 32,786,472,553 states generated (28,840,766 s/min), 2,332,364,440 distinct states found (1,361,923 ds/min), 66,856,953 states left on queue. +Progress(49) at 2024-11-07 10:17:15: 32,815,068,782 states generated (28,596,229 s/min), 2,333,629,799 distinct states found (1,265,359 ds/min), 66,266,973 states left on queue. +Progress(49) at 2024-11-07 10:18:15: 32,843,671,035 states generated (28,602,253 s/min), 2,334,875,787 distinct states found (1,245,988 ds/min), 65,714,901 states left on queue. +Progress(49) at 2024-11-07 10:19:15: 32,872,127,728 states generated (28,456,693 s/min), 2,336,030,334 distinct states found (1,154,547 ds/min), 65,023,805 states left on queue. +Progress(49) at 2024-11-07 10:20:15: 32,900,582,167 states generated (28,454,439 s/min), 2,337,180,611 distinct states found (1,150,277 ds/min), 64,304,348 states left on queue. +Progress(49) at 2024-11-07 10:21:15: 32,929,545,972 states generated (28,963,805 s/min), 2,338,488,833 distinct states found (1,308,222 ds/min), 63,715,470 states left on queue. +Progress(49) at 2024-11-07 10:22:15: 32,958,603,673 states generated (29,057,701 s/min), 2,339,992,330 distinct states found (1,503,497 ds/min), 63,307,968 states left on queue. +Progress(49) at 2024-11-07 10:23:15: 32,987,442,078 states generated (28,838,405 s/min), 2,341,335,966 distinct states found (1,343,636 ds/min), 62,792,292 states left on queue. +Progress(49) at 2024-11-07 10:24:15: 33,016,381,018 states generated (28,938,940 s/min), 2,342,828,482 distinct states found (1,492,516 ds/min), 62,365,394 states left on queue. +Progress(49) at 2024-11-07 10:25:15: 33,045,061,128 states generated (28,680,110 s/min), 2,344,118,515 distinct states found (1,290,033 ds/min), 61,789,542 states left on queue. +Progress(49) at 2024-11-07 10:26:15: 33,073,888,592 states generated (28,827,464 s/min), 2,345,475,829 distinct states found (1,357,314 ds/min), 61,253,128 states left on queue. 
+Progress(50) at 2024-11-07 10:27:15: 33,102,491,050 states generated (28,602,458 s/min), 2,346,652,625 distinct states found (1,176,796 ds/min), 60,570,177 states left on queue. +Progress(50) at 2024-11-07 10:28:15: 33,131,166,035 states generated (28,674,985 s/min), 2,347,941,873 distinct states found (1,289,248 ds/min), 59,969,815 states left on queue. +Progress(50) at 2024-11-07 10:29:15: 33,160,270,838 states generated (29,104,803 s/min), 2,349,441,004 distinct states found (1,499,131 ds/min), 59,570,847 states left on queue. +Progress(50) at 2024-11-07 10:30:15: 33,189,149,869 states generated (28,879,031 s/min), 2,350,812,706 distinct states found (1,371,702 ds/min), 59,068,202 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 10:31:16) +Progress(50) at 2024-11-07 10:31:16: 33,218,286,121 states generated (29,136,252 s/min), 2,352,357,375 distinct states found (1,544,669 ds/min), 58,692,343 states left on queue. +Progress(50) at 2024-11-07 10:32:16: 33,246,927,616 states generated (28,641,495 s/min), 2,353,796,993 distinct states found (1,439,618 ds/min), 58,245,674 states left on queue. +Progress(50) at 2024-11-07 10:33:16: 33,275,692,609 states generated (28,764,993 s/min), 2,355,282,278 distinct states found (1,485,285 ds/min), 57,825,713 states left on queue. +Progress(50) at 2024-11-07 10:34:16: 33,304,267,545 states generated (28,574,936 s/min), 2,356,681,270 distinct states found (1,398,992 ds/min), 57,325,849 states left on queue. +Progress(50) at 2024-11-07 10:35:16: 33,332,888,163 states generated (28,620,618 s/min), 2,358,099,683 distinct states found (1,418,413 ds/min), 56,833,993 states left on queue. +Progress(50) at 2024-11-07 10:36:16: 33,361,236,042 states generated (28,347,879 s/min), 2,359,281,358 distinct states found (1,181,675 ds/min), 56,126,890 states left on queue. +Progress(50) at 2024-11-07 10:37:16: 33,390,140,655 states generated (28,904,613 s/min), 2,360,868,517 distinct states found (1,587,159 ds/min), 55,791,859 states left on queue. +Progress(50) at 2024-11-07 10:38:16: 33,418,998,816 states generated (28,858,161 s/min), 2,362,363,780 distinct states found (1,495,263 ds/min), 55,385,255 states left on queue. +Progress(50) at 2024-11-07 10:39:16: 33,447,612,810 states generated (28,613,994 s/min), 2,363,728,858 distinct states found (1,365,078 ds/min), 54,854,942 states left on queue. +Progress(50) at 2024-11-07 10:40:16: 33,476,162,070 states generated (28,549,260 s/min), 2,365,099,267 distinct states found (1,370,409 ds/min), 54,312,039 states left on queue. +Progress(50) at 2024-11-07 10:41:16: 33,504,811,505 states generated (28,649,435 s/min), 2,366,473,549 distinct states found (1,374,282 ds/min), 53,784,809 states left on queue. +Progress(50) at 2024-11-07 10:42:16: 33,533,403,252 states generated (28,591,747 s/min), 2,367,734,253 distinct states found (1,260,704 ds/min), 53,158,819 states left on queue. +Progress(50) at 2024-11-07 10:43:16: 33,561,952,889 states generated (28,549,637 s/min), 2,368,855,124 distinct states found (1,120,871 ds/min), 52,441,471 states left on queue. +Progress(50) at 2024-11-07 10:44:16: 33,590,825,690 states generated (28,872,801 s/min), 2,370,054,403 distinct states found (1,199,279 ds/min), 51,878,202 states left on queue. +Progress(50) at 2024-11-07 10:45:16: 33,619,895,477 states generated (29,069,787 s/min), 2,371,355,035 distinct states found (1,300,632 ds/min), 51,382,836 states left on queue. 
+Progress(50) at 2024-11-07 10:46:16: 33,648,391,719 states generated (28,496,242 s/min), 2,372,441,699 distinct states found (1,086,664 ds/min), 50,647,071 states left on queue. +Progress(50) at 2024-11-07 10:47:16: 33,677,074,147 states generated (28,682,428 s/min), 2,373,600,507 distinct states found (1,158,808 ds/min), 50,052,421 states left on queue. +Progress(50) at 2024-11-07 10:48:16: 33,705,980,713 states generated (28,906,566 s/min), 2,375,050,402 distinct states found (1,449,895 ds/min), 49,692,912 states left on queue. +Progress(50) at 2024-11-07 10:49:16: 33,734,700,309 states generated (28,719,596 s/min), 2,376,355,805 distinct states found (1,305,403 ds/min), 49,202,990 states left on queue. +Progress(50) at 2024-11-07 10:50:16: 33,763,294,505 states generated (28,594,196 s/min), 2,377,489,014 distinct states found (1,133,209 ds/min), 48,526,991 states left on queue. +Progress(50) at 2024-11-07 10:51:16: 33,791,781,835 states generated (28,487,330 s/min), 2,378,610,114 distinct states found (1,121,100 ds/min), 47,806,234 states left on queue. +Progress(50) at 2024-11-07 10:52:16: 33,820,496,936 states generated (28,715,101 s/min), 2,379,861,294 distinct states found (1,251,180 ds/min), 47,194,112 states left on queue. +Progress(50) at 2024-11-07 10:53:16: 33,848,955,580 states generated (28,458,644 s/min), 2,381,018,247 distinct states found (1,156,953 ds/min), 46,544,595 states left on queue. +Progress(50) at 2024-11-07 10:54:16: 33,877,358,985 states generated (28,403,405 s/min), 2,382,084,162 distinct states found (1,065,915 ds/min), 45,797,353 states left on queue. +Progress(50) at 2024-11-07 10:55:16: 33,905,938,026 states generated (28,579,041 s/min), 2,383,237,725 distinct states found (1,153,563 ds/min), 45,079,182 states left on queue. +Progress(50) at 2024-11-07 10:56:16: 33,934,925,952 states generated (28,987,926 s/min), 2,384,648,770 distinct states found (1,411,045 ds/min), 44,602,865 states left on queue. +Progress(50) at 2024-11-07 10:57:16: 33,963,625,658 states generated (28,699,706 s/min), 2,385,892,826 distinct states found (1,244,056 ds/min), 44,000,281 states left on queue. +Progress(50) at 2024-11-07 10:58:16: 33,992,548,128 states generated (28,922,470 s/min), 2,387,290,030 distinct states found (1,397,204 ds/min), 43,514,140 states left on queue. +Progress(51) at 2024-11-07 10:59:16: 34,021,202,960 states generated (28,654,832 s/min), 2,388,511,227 distinct states found (1,221,197 ds/min), 42,867,785 states left on queue. +Progress(51) at 2024-11-07 11:00:16: 34,049,640,853 states generated (28,437,893 s/min), 2,389,565,989 distinct states found (1,054,762 ds/min), 42,084,713 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 11:01:17) +Progress(51) at 2024-11-07 11:01:17: 34,079,102,421 states generated (29,461,568 s/min), 2,391,039,395 distinct states found (1,473,406 ds/min), 41,644,463 states left on queue. +Progress(51) at 2024-11-07 11:02:17: 34,107,932,294 states generated (28,829,873 s/min), 2,392,415,920 distinct states found (1,376,525 ds/min), 41,153,999 states left on queue. +Progress(51) at 2024-11-07 11:03:17: 34,136,619,823 states generated (28,687,529 s/min), 2,393,784,341 distinct states found (1,368,421 ds/min), 40,648,398 states left on queue. +Progress(51) at 2024-11-07 11:04:17: 34,165,416,573 states generated (28,796,750 s/min), 2,395,186,568 distinct states found (1,402,227 ds/min), 40,162,223 states left on queue. 
+Progress(51) at 2024-11-07 11:05:17: 34,193,934,145 states generated (28,517,572 s/min), 2,396,461,207 distinct states found (1,274,639 ds/min), 39,558,749 states left on queue. +Progress(51) at 2024-11-07 11:06:17: 34,222,437,146 states generated (28,503,001 s/min), 2,397,667,005 distinct states found (1,205,798 ds/min), 38,877,170 states left on queue. +Progress(51) at 2024-11-07 11:07:17: 34,251,162,633 states generated (28,725,487 s/min), 2,399,047,586 distinct states found (1,380,581 ds/min), 38,366,536 states left on queue. +Progress(51) at 2024-11-07 11:08:17: 34,280,005,309 states generated (28,842,676 s/min), 2,400,476,715 distinct states found (1,429,129 ds/min), 37,912,093 states left on queue. +Progress(51) at 2024-11-07 11:09:17: 34,308,388,681 states generated (28,383,372 s/min), 2,401,648,509 distinct states found (1,171,794 ds/min), 37,215,479 states left on queue. +Progress(51) at 2024-11-07 11:10:17: 34,337,086,557 states generated (28,697,876 s/min), 2,403,035,913 distinct states found (1,387,404 ds/min), 36,712,331 states left on queue. +Progress(51) at 2024-11-07 11:11:17: 34,365,565,315 states generated (28,478,758 s/min), 2,404,187,792 distinct states found (1,151,879 ds/min), 36,008,223 states left on queue. +Progress(51) at 2024-11-07 11:12:17: 34,394,280,845 states generated (28,715,530 s/min), 2,405,264,161 distinct states found (1,076,369 ds/min), 35,318,651 states left on queue. +Progress(51) at 2024-11-07 11:13:17: 34,423,292,173 states generated (29,011,328 s/min), 2,406,461,030 distinct states found (1,196,869 ds/min), 34,731,310 states left on queue. +Progress(51) at 2024-11-07 11:14:17: 34,451,717,631 states generated (28,425,458 s/min), 2,407,470,263 distinct states found (1,009,233 ds/min), 33,977,845 states left on queue. +Progress(51) at 2024-11-07 11:15:17: 34,480,582,848 states generated (28,865,217 s/min), 2,408,844,472 distinct states found (1,374,209 ds/min), 33,563,385 states left on queue. +Progress(51) at 2024-11-07 11:16:17: 34,509,255,375 states generated (28,672,527 s/min), 2,409,992,223 distinct states found (1,147,751 ds/min), 32,948,371 states left on queue. +Progress(51) at 2024-11-07 11:17:17: 34,537,627,156 states generated (28,371,781 s/min), 2,411,007,744 distinct states found (1,015,521 ds/min), 32,138,450 states left on queue. +Progress(51) at 2024-11-07 11:18:17: 34,566,104,650 states generated (28,477,494 s/min), 2,412,094,834 distinct states found (1,087,090 ds/min), 31,405,790 states left on queue. +Progress(51) at 2024-11-07 11:19:17: 34,594,468,421 states generated (28,363,771 s/min), 2,413,136,514 distinct states found (1,041,680 ds/min), 30,631,648 states left on queue. +Progress(51) at 2024-11-07 11:20:17: 34,623,282,746 states generated (28,814,325 s/min), 2,414,376,756 distinct states found (1,240,242 ds/min), 30,011,457 states left on queue. +Progress(51) at 2024-11-07 11:21:17: 34,652,013,328 states generated (28,730,582 s/min), 2,415,631,977 distinct states found (1,255,221 ds/min), 29,420,035 states left on queue. +Progress(51) at 2024-11-07 11:22:17: 34,680,708,001 states generated (28,694,673 s/min), 2,416,841,149 distinct states found (1,209,172 ds/min), 28,780,239 states left on queue. +Progress(52) at 2024-11-07 11:23:17: 34,709,197,697 states generated (28,489,696 s/min), 2,417,931,157 distinct states found (1,090,008 ds/min), 28,033,256 states left on queue. 
+Progress(52) at 2024-11-07 11:24:17: 34,738,057,742 states generated (28,860,045 s/min), 2,419,214,866 distinct states found (1,283,709 ds/min), 27,476,210 states left on queue. +Progress(52) at 2024-11-07 11:25:17: 34,766,795,719 states generated (28,737,977 s/min), 2,420,575,203 distinct states found (1,360,337 ds/min), 26,973,510 states left on queue. +Progress(52) at 2024-11-07 11:26:17: 34,795,409,801 states generated (28,614,082 s/min), 2,421,852,170 distinct states found (1,276,967 ds/min), 26,383,152 states left on queue. +Progress(52) at 2024-11-07 11:27:17: 34,823,871,413 states generated (28,461,612 s/min), 2,423,018,118 distinct states found (1,165,948 ds/min), 25,687,358 states left on queue. +Progress(52) at 2024-11-07 11:28:17: 34,852,452,267 states generated (28,580,854 s/min), 2,424,258,491 distinct states found (1,240,373 ds/min), 25,061,677 states left on queue. +Progress(52) at 2024-11-07 11:29:17: 34,881,109,110 states generated (28,656,843 s/min), 2,425,536,450 distinct states found (1,277,959 ds/min), 24,485,682 states left on queue. +Progress(52) at 2024-11-07 11:30:17: 34,909,638,357 states generated (28,529,247 s/min), 2,426,766,241 distinct states found (1,229,791 ds/min), 23,851,800 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 11:31:18) +Progress(52) at 2024-11-07 11:31:18: 34,938,217,205 states generated (28,578,848 s/min), 2,427,804,784 distinct states found (1,038,543 ds/min), 23,061,400 states left on queue. +Progress(52) at 2024-11-07 11:32:18: 34,967,089,391 states generated (28,872,186 s/min), 2,428,907,251 distinct states found (1,102,467 ds/min), 22,421,037 states left on queue. +Progress(52) at 2024-11-07 11:33:18: 34,995,531,710 states generated (28,442,319 s/min), 2,429,963,142 distinct states found (1,055,891 ds/min), 21,740,235 states left on queue. +Progress(52) at 2024-11-07 11:34:18: 35,024,141,172 states generated (28,609,462 s/min), 2,431,122,150 distinct states found (1,159,008 ds/min), 21,149,288 states left on queue. +Progress(52) at 2024-11-07 11:35:18: 35,052,351,960 states generated (28,210,788 s/min), 2,432,077,858 distinct states found (955,708 ds/min), 20,295,072 states left on queue. +Progress(52) at 2024-11-07 11:36:18: 35,080,654,028 states generated (28,302,068 s/min), 2,433,061,991 distinct states found (984,133 ds/min), 19,478,746 states left on queue. +Progress(52) at 2024-11-07 11:37:18: 35,109,293,099 states generated (28,639,071 s/min), 2,434,258,110 distinct states found (1,196,119 ds/min), 18,850,062 states left on queue. +Progress(53) at 2024-11-07 11:38:18: 35,137,874,307 states generated (28,581,208 s/min), 2,435,408,538 distinct states found (1,150,428 ds/min), 18,171,042 states left on queue. +Progress(53) at 2024-11-07 11:39:18: 35,166,493,712 states generated (28,619,405 s/min), 2,436,567,034 distinct states found (1,158,496 ds/min), 17,510,811 states left on queue. +Progress(53) at 2024-11-07 11:40:18: 35,195,076,188 states generated (28,582,476 s/min), 2,437,810,887 distinct states found (1,243,853 ds/min), 16,916,098 states left on queue. +Progress(53) at 2024-11-07 11:41:18: 35,223,492,769 states generated (28,416,581 s/min), 2,438,939,934 distinct states found (1,129,047 ds/min), 16,200,301 states left on queue. +Progress(53) at 2024-11-07 11:42:18: 35,252,026,035 states generated (28,533,266 s/min), 2,440,130,151 distinct states found (1,190,217 ds/min), 15,545,447 states left on queue. 
+Progress(53) at 2024-11-07 11:43:18: 35,280,482,465 states generated (28,456,430 s/min), 2,441,297,027 distinct states found (1,166,876 ds/min), 14,879,990 states left on queue. +Progress(53) at 2024-11-07 11:44:18: 35,308,940,796 states generated (28,458,331 s/min), 2,442,317,453 distinct states found (1,020,426 ds/min), 14,116,803 states left on queue. +Progress(53) at 2024-11-07 11:45:18: 35,337,597,306 states generated (28,656,510 s/min), 2,443,328,791 distinct states found (1,011,338 ds/min), 13,403,307 states left on queue. +Progress(53) at 2024-11-07 11:46:18: 35,366,058,165 states generated (28,460,859 s/min), 2,444,336,498 distinct states found (1,007,707 ds/min), 12,657,418 states left on queue. +Progress(53) at 2024-11-07 11:47:18: 35,394,499,327 states generated (28,441,162 s/min), 2,445,346,072 distinct states found (1,009,574 ds/min), 11,856,670 states left on queue. +Progress(53) at 2024-11-07 11:48:18: 35,423,058,448 states generated (28,559,121 s/min), 2,446,449,527 distinct states found (1,103,455 ds/min), 11,150,850 states left on queue. +Progress(54) at 2024-11-07 11:49:18: 35,451,714,950 states generated (28,656,502 s/min), 2,447,608,246 distinct states found (1,158,719 ds/min), 10,497,489 states left on queue. +Progress(54) at 2024-11-07 11:50:18: 35,480,075,027 states generated (28,360,077 s/min), 2,448,668,413 distinct states found (1,060,167 ds/min), 9,734,924 states left on queue. +Progress(54) at 2024-11-07 11:51:18: 35,508,544,241 states generated (28,469,214 s/min), 2,449,793,995 distinct states found (1,125,582 ds/min), 9,041,108 states left on queue. +Progress(54) at 2024-11-07 11:52:18: 35,537,058,894 states generated (28,514,653 s/min), 2,450,835,560 distinct states found (1,041,565 ds/min), 8,304,357 states left on queue. +Progress(54) at 2024-11-07 11:53:18: 35,565,617,770 states generated (28,558,876 s/min), 2,451,805,307 distinct states found (969,747 ds/min), 7,554,593 states left on queue. +Progress(54) at 2024-11-07 11:54:18: 35,594,096,319 states generated (28,478,549 s/min), 2,452,829,286 distinct states found (1,023,979 ds/min), 6,777,854 states left on queue. +Progress(55) at 2024-11-07 11:55:18: 35,622,658,049 states generated (28,561,730 s/min), 2,453,911,213 distinct states found (1,081,927 ds/min), 6,063,348 states left on queue. +Progress(55) at 2024-11-07 11:56:18: 35,651,019,108 states generated (28,361,059 s/min), 2,454,944,844 distinct states found (1,033,631 ds/min), 5,290,297 states left on queue. +Progress(55) at 2024-11-07 11:57:18: 35,679,577,103 states generated (28,557,995 s/min), 2,455,941,484 distinct states found (996,640 ds/min), 4,540,257 states left on queue. +Progress(55) at 2024-11-07 11:58:18: 35,708,050,230 states generated (28,473,127 s/min), 2,456,911,566 distinct states found (970,082 ds/min), 3,737,722 states left on queue. +Progress(55) at 2024-11-07 11:59:18: 35,736,484,911 states generated (28,434,681 s/min), 2,457,942,176 distinct states found (1,030,610 ds/min), 2,980,348 states left on queue. +Progress(56) at 2024-11-07 12:00:18: 35,765,029,620 states generated (28,544,709 s/min), 2,458,911,346 distinct states found (969,170 ds/min), 2,201,353 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 12:01:18) +Progress(57) at 2024-11-07 12:01:18: 35,793,733,161 states generated (28,703,541 s/min), 2,459,897,228 distinct states found (985,882 ds/min), 1,411,705 states left on queue. 
+Progress(58) at 2024-11-07 12:02:18: 35,822,110,432 states generated (28,377,271 s/min), 2,460,820,961 distinct states found (923,733 ds/min), 587,430 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 4.5 + based on the actual fingerprints: val = .25 +35840434685 states generated, 2461362509 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 67. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 8 and the 95th percentile is 2). +Finished in 20h 32min at (2024-11-07 12:03:02) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log new file mode 100644 index 0000000000..c43d52302b --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log @@ -0,0 +1,89 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 90 and seed 2164066158568118414 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 30788] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-13824636513165485309/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-13824636513165485309/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-13824636513165485309/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-13824636513165485309/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-13824636513165485309/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-13824636513165485309/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-13824636513165485309/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 12:09:33) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 12:09:36. +Progress(16) at 2024-11-06 12:09:39: 405,675 states generated (405,675 s/min), 18,042 distinct states found (18,042 ds/min), 7,612 states left on queue. +Progress(23) at 2024-11-06 12:10:39: 12,449,257 states generated (12,043,582 s/min), 467,293 distinct states found (449,251 ds/min), 161,057 states left on queue. +Progress(25) at 2024-11-06 12:11:39: 24,461,332 states generated (12,012,075 s/min), 861,011 distinct states found (393,718 ds/min), 267,072 states left on queue. +Progress(26) at 2024-11-06 12:12:39: 36,440,377 states generated (11,979,045 s/min), 1,234,052 distinct states found (373,041 ds/min), 355,372 states left on queue. +Progress(26) at 2024-11-06 12:13:39: 48,327,873 states generated (11,887,496 s/min), 1,583,736 distinct states found (349,684 ds/min), 425,209 states left on queue. +Progress(27) at 2024-11-06 12:14:39: 60,246,136 states generated (11,918,263 s/min), 1,933,499 distinct states found (349,763 ds/min), 494,269 states left on queue. +Progress(28) at 2024-11-06 12:15:39: 71,977,716 states generated (11,731,580 s/min), 2,265,302 distinct states found (331,803 ds/min), 553,777 states left on queue. +Progress(28) at 2024-11-06 12:16:39: 83,644,537 states generated (11,666,821 s/min), 2,575,451 distinct states found (310,149 ds/min), 594,142 states left on queue. +Progress(29) at 2024-11-06 12:17:39: 95,287,089 states generated (11,642,552 s/min), 2,888,793 distinct states found (313,342 ds/min), 639,273 states left on queue. +Progress(29) at 2024-11-06 12:18:39: 107,000,972 states generated (11,713,883 s/min), 3,194,255 distinct states found (305,462 ds/min), 673,353 states left on queue. +Progress(29) at 2024-11-06 12:19:39: 118,305,248 states generated (11,304,276 s/min), 3,467,775 distinct states found (273,520 ds/min), 692,915 states left on queue. 
+Progress(29) at 2024-11-06 12:20:39: 129,954,327 states generated (11,649,079 s/min), 3,763,186 distinct states found (295,411 ds/min), 720,349 states left on queue. +Progress(29) at 2024-11-06 12:21:39: 141,251,359 states generated (11,297,032 s/min), 4,020,407 distinct states found (257,221 ds/min), 724,036 states left on queue. +Progress(30) at 2024-11-06 12:22:39: 152,551,873 states generated (11,300,514 s/min), 4,284,278 distinct states found (263,871 ds/min), 733,726 states left on queue. +Progress(30) at 2024-11-06 12:23:39: 164,324,788 states generated (11,772,915 s/min), 4,569,569 distinct states found (285,291 ds/min), 746,476 states left on queue. +Progress(30) at 2024-11-06 12:24:39: 175,121,317 states generated (10,796,529 s/min), 4,779,505 distinct states found (209,936 ds/min), 723,070 states left on queue. +Progress(31) at 2024-11-06 12:25:39: 186,238,236 states generated (11,116,919 s/min), 5,016,034 distinct states found (236,529 ds/min), 712,944 states left on queue. +Progress(31) at 2024-11-06 12:26:39: 197,884,578 states generated (11,646,342 s/min), 5,276,094 distinct states found (260,060 ds/min), 705,471 states left on queue. +Progress(31) at 2024-11-06 12:27:39: 208,535,096 states generated (10,650,518 s/min), 5,463,450 distinct states found (187,356 ds/min), 665,661 states left on queue. +Progress(32) at 2024-11-06 12:28:39: 219,424,829 states generated (10,889,733 s/min), 5,673,673 distinct states found (210,223 ds/min), 637,975 states left on queue. +Progress(32) at 2024-11-06 12:29:39: 230,906,372 states generated (11,481,543 s/min), 5,903,516 distinct states found (229,843 ds/min), 606,255 states left on queue. +Progress(33) at 2024-11-06 12:30:39: 241,261,887 states generated (10,355,515 s/min), 6,065,731 distinct states found (162,215 ds/min), 552,728 states left on queue. +Progress(33) at 2024-11-06 12:31:39: 252,028,921 states generated (10,767,034 s/min), 6,255,487 distinct states found (189,756 ds/min), 509,620 states left on queue. +Progress(33) at 2024-11-06 12:32:39: 262,856,171 states generated (10,827,250 s/min), 6,431,063 distinct states found (175,576 ds/min), 448,834 states left on queue. +Progress(34) at 2024-11-06 12:33:39: 273,211,882 states generated (10,355,711 s/min), 6,586,644 distinct states found (155,581 ds/min), 386,905 states left on queue. +Progress(34) at 2024-11-06 12:34:39: 283,843,415 states generated (10,631,533 s/min), 6,743,916 distinct states found (157,272 ds/min), 315,135 states left on queue. +Progress(35) at 2024-11-06 12:35:39: 293,931,115 states generated (10,087,700 s/min), 6,878,405 distinct states found (134,489 ds/min), 241,126 states left on queue. +Progress(36) at 2024-11-06 12:36:39: 303,903,441 states generated (9,972,326 s/min), 6,996,394 distinct states found (117,989 ds/min), 152,775 states left on queue. +Progress(37) at 2024-11-06 12:37:39: 313,501,886 states generated (9,598,445 s/min), 7,093,031 distinct states found (96,637 ds/min), 54,009 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 1.2E-4 + based on the actual fingerprints: val = 2.1E-6 +318172398 states generated, 7127950 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 44. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3). 
+Finished in 28min 43s at (2024-11-06 12:38:16) From a0cd64c4d3b1cf9738f8ec1b3a5bf433e42cc03d Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 2 Dec 2024 18:24:48 +0100 Subject: [PATCH 159/209] Bump OTel, tracing, reqwest crates (#9970) --- Cargo.lock | 158 ++++++++++++++++++++++++++--------------------------- Cargo.toml | 16 +++--- 2 files changed, 86 insertions(+), 88 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ce27a7d45..ba02e3b11d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -185,7 +185,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "synstructure", ] @@ -197,7 +197,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -256,7 +256,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -267,7 +267,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -969,7 +969,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1198,7 +1198,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1615,7 +1615,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1626,7 +1626,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1749,7 +1749,7 @@ dependencies = [ "dsl_auto_type", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1769,7 +1769,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1792,7 +1792,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1815,7 +1815,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1947,7 +1947,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1980,7 +1980,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -2234,7 +2234,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -2337,7 +2337,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -3142,7 +3142,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -3515,9 +3515,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.24.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96" +checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17" dependencies = [ "futures-core", "futures-sink", @@ -3529,9 +3529,9 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.13.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad31e9de44ee3538fb9d64fe3376c1362f406162434609e79aea2a41a0af78ab" +checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99" dependencies = [ "async-trait", "bytes", @@ -3542,9 +3542,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.17.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b925a602ffb916fb7421276b86756027b37ee708f9dce2dbdcc51739f07e727" +checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd" dependencies = [ "async-trait", "futures-core", @@ -3560,9 +3560,9 @@ dependencies = [ [[package]] name = "opentelemetry-proto" -version = "0.7.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9" +checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34" dependencies = [ "opentelemetry", "opentelemetry_sdk", @@ -3572,15 +3572,15 @@ dependencies = [ [[package]] name = "opentelemetry-semantic-conventions" -version = "0.16.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cefe0543875379e47eb5f1e68ff83f45cc41366a92dfd0d073d513bf68e9a05" +checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09" [[package]] name = "opentelemetry_sdk" -version = "0.24.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df" +checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3" dependencies = [ "async-trait", "futures-channel", @@ -3954,7 +3954,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4056,7 +4056,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4334,7 +4334,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4348,9 +4348,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -4424,7 +4424,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.52", + "syn 2.0.90", "tempfile", ] @@ -4438,7 +4438,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4992,9 +4992,9 @@ dependencies = [ [[package]] name = "reqwest-middleware" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" +checksum = 
"d1ccd3b55e711f91a9885a2fa6fbbb2e39db1776420b062efc058c6410f7e5e3" dependencies = [ "anyhow", "async-trait", @@ -5007,13 +5007,12 @@ dependencies = [ [[package]] name = "reqwest-retry" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" +checksum = "29c73e4195a6bfbcb174b790d9b3407ab90646976c55de58a6515da25d851178" dependencies = [ "anyhow", "async-trait", - "chrono", "futures", "getrandom 0.2.11", "http 1.1.0", @@ -5022,6 +5021,7 @@ dependencies = [ "reqwest 0.12.4", "reqwest-middleware", "retry-policies", + "thiserror", "tokio", "tracing", "wasm-timer", @@ -5029,9 +5029,9 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdd9bfa64c72233d8dd99ab7883efcdefe9e16d46488ecb9228b71a2e2ceb45" +checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2" dependencies = [ "anyhow", "async-trait", @@ -5047,12 +5047,10 @@ dependencies = [ [[package]] name = "retry-policies" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" +checksum = "5875471e6cab2871bc150ecb8c727db5113c9338cc3354dc5ee3425b6aa40a1c" dependencies = [ - "anyhow", - "chrono", "rand 0.8.5", ] @@ -5176,7 +5174,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.52", + "syn 2.0.90", "unicode-ident", ] @@ -5684,7 +5682,7 @@ checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -5766,7 +5764,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6139,7 +6137,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6190,9 +6188,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.52" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -6222,7 +6220,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6300,27 +6298,27 @@ checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] name = "thiserror" -version = "1.0.57" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.57" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6494,7 +6492,7 @@ checksum = 
"5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6719,7 +6717,7 @@ dependencies = [ "prost-build", "prost-types", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6756,9 +6754,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "log", "pin-project-lite", @@ -6779,20 +6777,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", "valuable", @@ -6821,9 +6819,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.25.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b" +checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b" dependencies = [ "js-sys", "once_cell", @@ -6839,9 +6837,9 @@ dependencies = [ [[package]] name = "tracing-serde" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" dependencies = [ "serde", "tracing-core", @@ -6849,9 +6847,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "matchers", "once_cell", @@ -7258,7 +7256,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "wasm-bindgen-shared", ] @@ -7292,7 +7290,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -7669,7 +7667,7 @@ dependencies = [ "smallvec", "spki 0.7.3", "subtle", - "syn 2.0.52", + "syn 2.0.90", "sync_wrapper 0.1.2", "tikv-jemalloc-sys", "time", @@ -7769,7 +7767,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -7790,7 +7788,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 
64c384f17a..036dc01057 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -127,10 +127,10 @@ notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.24" -opentelemetry_sdk = "0.24" -opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.16" +opentelemetry = "0.26" +opentelemetry_sdk = "0.26" +opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.26" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" @@ -144,9 +144,9 @@ rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] } -reqwest-middleware = "0.3.0" -reqwest-retry = "0.5" +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] } +reqwest-middleware = "0.4" +reqwest-retry = "0.7" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" @@ -191,7 +191,7 @@ tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2" -tracing-opentelemetry = "0.25" +tracing-opentelemetry = "0.27" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } From 32ba9811f979f0ae31ee489202304becd728d511 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 2 Dec 2024 17:54:32 +0000 Subject: [PATCH 160/209] feat(proxy): emit JWT auth method and JWT issuer in parquet logs (#9971) Fix the HTTP AuthMethod to accommodate the JWT authorization method.
Introduces the JWT issuer as an additional field in the parquet logs --- proxy/src/auth/backend/jwt.rs | 10 +++-- proxy/src/context/mod.rs | 9 +++++ proxy/src/context/parquet.rs | 53 +++++++++++++++------------ proxy/src/serverless/backend.rs | 4 ++ proxy/src/serverless/sql_over_http.rs | 3 -- 5 files changed, 49 insertions(+), 30 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 517d4fd34b..a258090b15 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -350,6 +350,13 @@ impl JwkCacheEntryLock { let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; let header = serde_json::from_slice::>(&header)?; + let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payload = serde_json::from_slice::>(&payloadb)?; + + if let Some(iss) = &payload.issuer { + ctx.set_jwt_issuer(iss.as_ref().to_owned()); + } + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; @@ -388,9 +395,6 @@ impl JwkCacheEntryLock { key => return Err(JwtError::UnsupportedKeyType(key.into())), }; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; - let payload = serde_json::from_slice::>(&payloadb)?; - tracing::debug!(?payload, "JWT signature valid with claims"); if let Some(aud) = expected_audience { diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 4a063a5faa..a9fb513d3c 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -57,6 +57,7 @@ struct RequestContextInner { application: Option, error_kind: Option, pub(crate) auth_method: Option, + jwt_issuer: Option, success: bool, pub(crate) cold_start_info: ColdStartInfo, pg_options: Option, @@ -79,6 +80,7 @@ pub(crate) enum AuthMethod { ScramSha256, ScramSha256Plus, Cleartext, + Jwt, } impl Clone for RequestContext { @@ -100,6 +102,7 @@ impl Clone for RequestContext { application: inner.application.clone(), error_kind: inner.error_kind, auth_method: inner.auth_method.clone(), + jwt_issuer: inner.jwt_issuer.clone(), success: inner.success, rejected: inner.rejected, cold_start_info: inner.cold_start_info, @@ -148,6 +151,7 @@ impl RequestContext { application: None, error_kind: None, auth_method: None, + jwt_issuer: None, success: false, rejected: None, cold_start_info: ColdStartInfo::Unknown, @@ -246,6 +250,11 @@ impl RequestContext { this.auth_method = Some(auth_method); } + pub(crate) fn set_jwt_issuer(&self, jwt_issuer: String) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.jwt_issuer = Some(jwt_issuer); + } + pub fn has_private_peer_addr(&self) -> bool { self.0 .try_lock() diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index b375eb886e..3105d08526 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -87,6 +87,8 @@ pub(crate) struct RequestData { branch: Option, pg_options: Option, auth_method: Option<&'static str>, + jwt_issuer: Option, + error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside /// Or if we make it to proxy_pass @@ -138,7 +140,9 @@ impl From<&RequestContextInner> for RequestData { super::AuthMethod::ScramSha256 => "scram_sha_256", super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", + super::AuthMethod::Jwt => "jwt", }), + jwt_issuer: value.jwt_issuer.clone(), protocol: value.protocol.as_str(), region: value.region, error: 
value.error_kind.as_ref().map(|e| e.to_metric_label()), @@ -519,6 +523,7 @@ mod tests { branch: Some(hex::encode(rng.gen::<[u8; 16]>())), pg_options: None, auth_method: None, + jwt_issuer: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, @@ -599,15 +604,15 @@ mod tests { assert_eq!( file_stats, [ - (1312632, 3, 6000), - (1312621, 3, 6000), - (1312680, 3, 6000), - (1312637, 3, 6000), - (1312773, 3, 6000), - (1312610, 3, 6000), - (1312404, 3, 6000), - (1312639, 3, 6000), - (437848, 1, 2000) + (1313105, 3, 6000), + (1313094, 3, 6000), + (1313153, 3, 6000), + (1313110, 3, 6000), + (1313246, 3, 6000), + (1313083, 3, 6000), + (1312877, 3, 6000), + (1313112, 3, 6000), + (438020, 1, 2000) ] ); @@ -639,11 +644,11 @@ mod tests { assert_eq!( file_stats, [ - (1203465, 5, 10000), - (1203189, 5, 10000), - (1203490, 5, 10000), - (1203475, 5, 10000), - (1203729, 5, 10000) + (1204324, 5, 10000), + (1204048, 5, 10000), + (1204349, 5, 10000), + (1204334, 5, 10000), + (1204588, 5, 10000) ] ); @@ -668,15 +673,15 @@ mod tests { assert_eq!( file_stats, [ - (1312632, 3, 6000), - (1312621, 3, 6000), - (1312680, 3, 6000), - (1312637, 3, 6000), - (1312773, 3, 6000), - (1312610, 3, 6000), - (1312404, 3, 6000), - (1312639, 3, 6000), - (437848, 1, 2000) + (1313105, 3, 6000), + (1313094, 3, 6000), + (1313153, 3, 6000), + (1313110, 3, 6000), + (1313246, 3, 6000), + (1313083, 3, 6000), + (1312877, 3, 6000), + (1313112, 3, 6000), + (438020, 1, 2000) ] ); @@ -713,7 +718,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)] + [(658014, 2, 3001), (657728, 2, 3000), (657524, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 75909f3358..57846a4c2c 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -53,6 +53,8 @@ impl PoolingBackend { user_info: &ComputeUserInfo, password: &[u8], ) -> Result { + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + let user_info = user_info.clone(); let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; @@ -115,6 +117,8 @@ impl PoolingBackend { user_info: &ComputeUserInfo, jwt: String, ) -> Result { + ctx.set_auth_method(crate::context::AuthMethod::Jwt); + match &self.auth_backend { crate::auth::Backend::ControlPlane(console, ()) => { self.config diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index afd93d02f0..a0ca7cc60d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -139,9 +139,6 @@ fn get_conn_info( headers: &HeaderMap, tls: Option<&TlsConfig>, ) -> Result { - // HTTP only uses cleartext (for now and likely always) - ctx.set_auth_method(crate::context::AuthMethod::Cleartext); - let connection_string = headers .get(&CONN_STRING) .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? From 84b48211185c818a8a788d412484035d561beaa4 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 2 Dec 2024 12:06:19 -0600 Subject: [PATCH 161/209] Stop changing the value of neon.extension_server_port at runtime (#9972) On reconfigure, we no longer passed a port for the extension server which caused us to not write out the neon.extension_server_port line. 
Thus, Postgres thought we were setting the port to the default value of 0. PGC_POSTMASTER GUCs cannot be set at runtime, which causes the following log messages: > LOG: parameter "neon.extension_server_port" cannot be changed without restarting the server > LOG: configuration file "/var/db/postgres/compute/pgdata/postgresql.conf" contains errors; unaffected changes were applied Fixes: https://github.com/neondatabase/neon/issues/9945 Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 9 ++------- compute_tools/src/compute.rs | 19 +++++++------------ compute_tools/src/config.rs | 6 ++---- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b178d7abd6..e73ccd908e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -335,6 +335,7 @@ fn wait_spec( pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), pgversion: get_pg_version_string(pgbin), + http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), @@ -389,7 +390,6 @@ fn wait_spec( Ok(WaitSpecResult { compute, - http_port, resize_swap_on_bind, set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), }) @@ -397,8 +397,6 @@ fn wait_spec( struct WaitSpecResult { compute: Arc, - // passed through from ProcessCliResult - http_port: u16, resize_swap_on_bind: bool, set_disk_quota_for_fs: Option, } @@ -408,7 +406,6 @@ fn start_postgres( #[allow(unused_variables)] matches: &clap::ArgMatches, WaitSpecResult { compute, - http_port, resize_swap_on_bind, set_disk_quota_for_fs, }: WaitSpecResult, @@ -481,12 +478,10 @@ fn start_postgres( } } - let extension_server_port: u16 = http_port; - // Start Postgres let mut pg = None; if !prestartup_failed { - pg = match compute.start_compute(extension_server_port) { + pg = match compute.start_compute() { Ok(pg) => Some(pg), Err(err) => { error!("could not start the compute node: {:#}", err); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index da1caf1a9b..0d1e6d680f 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -79,6 +79,8 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, + /// The port that the compute's HTTP server listens on + pub http_port: u16, /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. /// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -611,11 +613,7 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. 
#[instrument(skip_all)] - pub fn prepare_pgdata( - &self, - compute_state: &ComputeState, - extension_server_port: u16, - ) -> Result<()> { + pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; let pgdata_path = Path::new(&self.pgdata); @@ -625,7 +623,7 @@ impl ComputeNode { config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - Some(extension_server_port), + self.http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -1243,7 +1241,7 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, None)?; + config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; // TODO(ololobus): We need a concurrency during reconfiguration as well, // but DB is already running and used by user. We can easily get out of @@ -1284,10 +1282,7 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute( - &self, - extension_server_port: u16, - ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -1362,7 +1357,7 @@ impl ComputeNode { info!("{:?}", remote_ext_metrics); } - self.prepare_pgdata(&compute_state, extension_server_port)?; + self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index d65fe73194..b257c8a68f 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -37,7 +37,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { pub fn write_postgres_conf( path: &Path, spec: &ComputeSpec, - extension_server_port: Option, + extension_server_port: u16, ) -> Result<()> { // File::create() destroys the file content if it exists. let mut file = File::create(path)?; @@ -127,9 +127,7 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl: end")?; } - if let Some(port) = extension_server_port { - writeln!(file, "neon.extension_server_port={}", port)?; - } + writeln!(file, "neon.extension_server_port={}", extension_server_port)?; // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. From c4987b0b13b9c311ea0f74169b90071323e14cff Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 2 Dec 2024 19:46:06 +0100 Subject: [PATCH 162/209] fix(testing): Use 1 MB shared_buffers even with LFC (#9969) ## Problem After enabling LFC in tests and lowering `shared_buffers` we started having more problems with `test_pg_regress`. ## Summary of changes Set `shared_buffers` to 1MB to both exercise getPage requests/LFC, and still have enough room for Postgres to operate. Everything smaller might be not enough for Postgres under load, and can cause errors like 'no unpinned buffers available'. See Konstantin's comment [1] as well. 
Fixes #9956 [1]: https://github.com/neondatabase/neon/issues/9956#issuecomment-2511608097 --- control_plane/src/endpoint.rs | 4 ++++ test_runner/fixtures/neon_fixtures.py | 6 ++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 71514daa7c..1ca6dc43c4 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -310,6 +310,10 @@ impl Endpoint { conf.append("wal_log_hints", "off"); conf.append("max_replication_slots", "10"); conf.append("hot_standby", "on"); + // Set to 1MB to both exercise getPage requests/LFC, and still have enough room for + // Postgres to operate. Everything smaller might be not enough for Postgres under load, + // and can cause errors like 'no unpinned buffers available', see + // conf.append("shared_buffers", "1MB"); conf.append("fsync", "off"); conf.append("max_connections", "100"); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5709a3b82b..f55f06bebc 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3801,13 +3801,11 @@ class Endpoint(PgProtocol, LogUtils): assert size_to_bytes(size) >= size_to_bytes( "1MB" ), "LFC size cannot be set less than 1MB" - # shared_buffers = 512kB to make postgres use LFC intensively - # neon.max_file_cache_size and neon.file_cache size limit are - # set to 1MB because small LFC is better for testing (helps to find more problems) lfc_path_escaped = str(lfc_path).replace("'", "''") config_lines = [ - "shared_buffers = 512kB", f"neon.file_cache_path = '{lfc_path_escaped}'", + # neon.max_file_cache_size and neon.file_cache size limits are + # set to 1MB because small LFC is better for testing (helps to find more problems) "neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB", ] + config_lines From 4a488b3e24af43289a6da6b0640ebcf7c83881f3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 08:59:38 +0000 Subject: [PATCH 163/209] storcon: use proper schedule context during node delete (#9958) ## Problem I was touching `test_storage_controller_node_deletion` because for AZ scheduling work I was adding a change to the storage controller (kick secondaries during optimisation) that made a FIXME in this test defunct. While looking at it I also realized that we can easily fix the way node deletion currently doesn't use a proper ScheduleContext, using the iterator type recently added for that purpose. ## Summary of changes - A testing-only behavior in storage controller where if a secondary location isn't yet ready during optimisation, it will be actively polled. - Remove workaround in `test_storage_controller_node_deletion` that previously was needed because optimisation would get stuck on cold secondaries. 
- Update node deletion code to use a `TenantShardContextIterator` and thereby a proper ScheduleContext --- storage_controller/src/service.rs | 112 ++++++++++++++---- test_runner/regress/test_sharding.py | 7 ++ .../regress/test_storage_controller.py | 8 +- 3 files changed, 96 insertions(+), 31 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 631fdb4923..52c9c4710d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5158,34 +5158,38 @@ impl Service { *nodes = Arc::new(nodes_mut); } - for (tenant_shard_id, shard) in tenants { - if shard.deref_node(node_id) { - // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise - // it won't properly do anti-affinity. - let mut schedule_context = ScheduleContext::default(); + for (_tenant_id, mut schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Normal) + { + for shard in shards { + if shard.deref_node(node_id) { + if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { + // TODO: implement force flag to remove a node even if we can't reschedule + // a tenant + tracing::error!( + "Refusing to delete node, shard {} can't be rescheduled: {e}", + shard.tenant_shard_id + ); + return Err(e.into()); + } else { + tracing::info!( + "Rescheduled shard {} away from node during deletion", + shard.tenant_shard_id + ) + } - if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { - // TODO: implement force flag to remove a node even if we can't reschedule - // a tenant - tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}"); - return Err(e.into()); - } else { - tracing::info!( - "Rescheduled shard {tenant_shard_id} away from node during deletion" - ) + self.maybe_reconcile_shard(shard, nodes); } - self.maybe_reconcile_shard(shard, nodes); + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. + // + // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that + // means any reconciles we spawned will know about the node we're deleting, enabling them + // to do live migrations if it's still online. + shard.observed.locations.remove(&node_id); } - - // Here we remove an existing observed location for the node we're removing, and it will - // not be re-added by a reconciler's completion because we filter out removed nodes in - // process_result. - // - // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that - // means any reconciles we spawned will know about the node we're deleting, enabling them - // to do live migrations if it's still online. - shard.observed.locations.remove(&node_id); } scheduler.node_remove(node_id); @@ -6279,6 +6283,14 @@ impl Service { > DOWNLOAD_FRESHNESS_THRESHOLD { tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + + #[cfg(feature = "testing")] + if progress.heatmap_mtime.is_none() { + // No heatmap might mean the attached location has never uploaded one, or that + // the secondary download hasn't happened yet. This is relatively unusual in the field, + // but fairly common in tests. 
+ self.kick_secondary_download(tenant_shard_id).await; + } } else { // Location looks ready: proceed tracing::info!( @@ -6293,6 +6305,58 @@ impl Service { validated_work } + /// Some aspects of scheduling optimisation wait for secondary locations to be warm. This + /// happens on multi-minute timescales in the field, which is fine because optimisation is meant + /// to be a lazy background thing. However, when testing, it is not practical to wait around, so + /// we have this helper to move things along faster. + #[cfg(feature = "testing")] + async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { + let (attached_node, secondary_node) = { + let locked = self.inner.read().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return; + }; + let (Some(attached), Some(secondary)) = ( + shard.intent.get_attached(), + shard.intent.get_secondary().first(), + ) else { + return; + }; + ( + locked.nodes.get(attached).unwrap().clone(), + locked.nodes.get(secondary).unwrap().clone(), + ) + }; + + // Make remote API calls to upload + download heatmaps: we ignore errors because this is just + // a 'kick' to let scheduling optimisation run more promptly. + attached_node + .with_client_retries( + |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + secondary_node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1))) + .await + }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + } + /// Look for shards which are oversized and in need of splitting async fn autosplit_tenants(self: &Arc) { let Some(split_threshold) = self.config.split_threshold else { diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index c86ba0d4ea..30abf91d3a 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -519,6 +519,13 @@ def test_sharding_split_smoke( # We will have 2 shards per pageserver once done (including secondaries) neon_env_builder.num_pageservers = split_shard_count + # Two AZs + def assign_az(ps_cfg): + az = f"az-{(ps_cfg['id'] - 1) % 2}" + ps_cfg["availability_zone"] = az + + neon_env_builder.pageserver_config_override = assign_az + # 1MiB stripes: enable getting some meaningful data distribution without # writing large quantities of data in this test. The stripe size is given # in number of 8KiB pages. diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index e93e251b4f..685af5caaf 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2253,12 +2253,7 @@ def test_storage_controller_node_deletion( assert victim.id not in shard["node_secondary"] # Reconciles running during deletion should all complete - # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting - # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3 - # test that hasn't uploaded any heatmaps for secondaries. - # In the interim, just do a reconcile_all to enable the consistency check. 
- # env.storage_controller.reconcile_until_idle() - env.storage_controller.reconcile_all() + env.storage_controller.reconcile_until_idle() # Controller should pass its own consistency checks env.storage_controller.consistency_check() @@ -2267,7 +2262,6 @@ def test_storage_controller_node_deletion( env.storage_controller.stop() env.storage_controller.start() assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] - env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. env.storage_controller.consistency_check() From 8963ac85f9695f529ded4104d2e34e29bed38b7b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 3 Dec 2024 11:55:13 +0100 Subject: [PATCH 164/209] storcon_cli tenant-describe: include tenant-wide information in output (#9899) Before this PR, the storcon_cli didn't have a way to show the tenant-wide information of the TenantDescribeResponse. Sadly, the `Serialize` impl for the tenant config doesn't skip on `None`, so, the output becomes a bit bloated. Maybe we can use `skip_serializing_if(Option::is_none)` in the future. => https://github.com/neondatabase/neon/issues/9983 --- control_plane/storcon_cli/src/main.rs | 16 ++++++++++++++-- test_runner/regress/test_storage_controller.py | 4 ++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b7f38c6286..e879424532 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -560,14 +560,26 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::TenantDescribe { tenant_id } => { - let describe_response = storcon_client + let TenantDescribeResponse { + tenant_id, + shards, + stripe_size, + policy, + config, + } = storcon_client .dispatch::<(), TenantDescribeResponse>( Method::GET, format!("control/v1/tenant/{tenant_id}"), None, ) .await?; - let shards = describe_response.shards; + println!("Tenant {tenant_id}"); + let mut table = comfy_table::Table::new(); + table.add_row(["Policy", &format!("{:?}", policy)]); + table.add_row(["Stripe size", &format!("{:?}", stripe_size)]); + table.add_row(["Config", &serde_json::to_string_pretty(&config).unwrap()]); + println!("{table}"); + println!("Shards:"); let mut table = comfy_table::Table::new(); table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); for shard in shards { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 685af5caaf..244893a616 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1747,8 +1747,8 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): # Describe a tenant tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) - assert len(tenant_lines) == 3 + shard_count * 2 - assert str(env.initial_tenant) in tenant_lines[3] + assert len(tenant_lines) >= 3 + shard_count * 2 + assert str(env.initial_tenant) in tenant_lines[0] # Pause changes on a tenant storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) From dd76f1eeeee5eac0f7a606fc136f20df2e9218e1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 3 Dec 2024 12:03:23 +0100 Subject: [PATCH 165/209] page_service: batching observability & include throttled time in smgr metrics (#9870) This PR - fixes smgr metrics https://github.com/neondatabase/neon/issues/9925 - adds an 
additional startup log line logging the current batching config - adds a histogram of batch sizes global and per-tenant - adds a metric exposing the current batching config The issue described in #9925 is that before this PR, request latency was only observed *after* batching. This means that smgr latency metrics (most importantly getpage latency) don't account for - `wait_lsn` time - time spent waiting for batch to fill up / the executor stage to pick up the batch. The fix is to use a per-request batching timer, like we did before the initial batching PR. We funnel those timers through the entire request lifecycle. I noticed that even before the initial batching changes, we weren't accounting for the time spent writing & flushing the response to the wire. This PR drive-by fixes that deficiency by dropping the timers at the very end of processing the batch, i.e., after the `pgb.flush()` call. I was **unable** to maintain the behavior that we deduct time-spent-in-throttle from various latency metrics. The reason is that we're using a *single* counter in `RequestContext` to track micros spent in throttle. But there are *N* metrics timers in the batch, one per request. As a consequence, the practice of consuming the counter in the drop handler of each timer no longer works because all but the first timer will encounter error `close() called on closed state`. A failed attempt to maintain the current behavior can be found in https://github.com/neondatabase/neon/pull/9951. So, this PR removes the deduction behavior from all metrics. I started a discussion on Slack about the implications this has for our internal SLO calculation: https://neondb.slack.com/archives/C033RQ5SPDH/p1732910861704029 # Refs - fixes https://github.com/neondatabase/neon/issues/9925 - sub-issue https://github.com/neondatabase/neon/issues/9377 - epic: https://github.com/neondatabase/neon/issues/9376 --- pageserver/src/bin/pageserver.rs | 3 +- pageserver/src/context.rs | 5 - pageserver/src/context/optional_counter.rs | 101 ------ pageserver/src/metrics.rs | 249 +++++++------- pageserver/src/page_service.rs | 309 +++++++++++------- pageserver/src/pgdatadir_mapping.rs | 18 +- pageserver/src/tenant/throttle.rs | 17 +- pageserver/src/tenant/timeline.rs | 7 +- test_runner/fixtures/metrics.py | 1 + .../pageserver/test_page_service_batching.py | 28 +- .../test_pageserver_getpage_throttle.py | 31 +- 11 files changed, 373 insertions(+), 396 deletions(-) delete mode 100644 pageserver/src/context/optional_counter.rs diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a8c2c2e992..31f4370855 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -127,6 +127,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); + info!(?conf.page_service_pipelining, "starting with page service pipelining config"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding.
@@ -302,7 +303,7 @@ fn start_pageserver( pageserver::metrics::tokio_epoll_uring::Collector::new(), )) .unwrap(); - pageserver::preinitialize_metrics(); + pageserver::preinitialize_metrics(conf); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 7afcf52cf2..8f2177fe5b 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -91,8 +91,6 @@ use crate::task_mgr::TaskKind; -pub(crate) mod optional_counter; - // The main structure of this module, see module-level comment. #[derive(Debug)] pub struct RequestContext { @@ -100,7 +98,6 @@ pub struct RequestContext { download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, - pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32, } /// The kind of access to the page cache. @@ -158,7 +155,6 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, - micros_spent_throttled: Default::default(), }, } } @@ -172,7 +168,6 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, - micros_spent_throttled: Default::default(), }, } } diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs deleted file mode 100644 index 100c649f18..0000000000 --- a/pageserver/src/context/optional_counter.rs +++ /dev/null @@ -1,101 +0,0 @@ -use std::{ - sync::atomic::{AtomicU32, Ordering}, - time::Duration, -}; - -#[derive(Debug)] -pub struct CounterU32 { - inner: AtomicU32, -} -impl Default for CounterU32 { - fn default() -> Self { - Self { - inner: AtomicU32::new(u32::MAX), - } - } -} -impl CounterU32 { - pub fn open(&self) -> Result<(), &'static str> { - match self - .inner - .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed) - { - Ok(_) => Ok(()), - Err(_) => Err("open() called on clsoed state"), - } - } - pub fn close(&self) -> Result { - match self.inner.swap(u32::MAX, Ordering::Relaxed) { - u32::MAX => Err("close() called on closed state"), - x => Ok(x), - } - } - - pub fn add(&self, count: u32) -> Result<(), &'static str> { - if count == 0 { - return Ok(()); - } - let mut had_err = None; - self.inner - .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur { - u32::MAX => { - had_err = Some("add() called on closed state"); - None - } - x => { - let (new, overflowed) = x.overflowing_add(count); - if new == u32::MAX || overflowed { - had_err = Some("add() overflowed the counter"); - None - } else { - Some(new) - } - } - }) - .map_err(|_| had_err.expect("we set it whenever the function returns None")) - .map(|_| ()) - } -} - -#[derive(Default, Debug)] -pub struct MicroSecondsCounterU32 { - inner: CounterU32, -} - -impl MicroSecondsCounterU32 { - pub fn open(&self) -> Result<(), &'static str> { - self.inner.open() - } - pub fn add(&self, duration: Duration) -> Result<(), &'static str> { - match duration.as_micros().try_into() { - Ok(x) => self.inner.add(x), - Err(_) => Err("add(): duration conversion error"), - } - } - pub fn close_and_checked_sub_from(&self, from: Duration) -> Result { - let val = self.inner.close()?; - let val = Duration::from_micros(val as u64); - let subbed = match from.checked_sub(val) { - Some(v) => v, - None => 
return Err("Duration::checked_sub"), - }; - Ok(subbed) - } -} - -#[cfg(test)] -mod tests { - - use super::*; - - #[test] - fn test_basic() { - let counter = MicroSecondsCounterU32::default(); - counter.open().unwrap(); - counter.add(Duration::from_micros(23)).unwrap(); - let res = counter - .close_and_checked_sub_from(Duration::from_micros(42)) - .unwrap(); - assert_eq!(res, Duration::from_micros(42 - 23)); - } -} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 86be97587f..d04fae7627 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -7,6 +7,10 @@ use metrics::{ IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use pageserver_api::config::{ + PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, + PageServiceProtocolPipelinedExecutionStrategy, +}; use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, QueryError}; use pq_proto::framed::ConnectionError; @@ -1216,50 +1220,21 @@ pub(crate) mod virtual_file_io_engine { }); } -struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - global_latency_histo: &'a Histogram, +pub(crate) struct SmgrOpTimer { + global_latency_histo: Histogram, // Optional because not all op types are tracked per-timeline - per_timeline_latency_histo: Option<&'a Histogram>, + per_timeline_latency_histo: Option, - ctx: &'c RequestContext, - start: std::time::Instant, - op: SmgrQueryType, - count: usize, + start: Instant, } -impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { +impl Drop for SmgrOpTimer { fn drop(&mut self) { - let elapsed = self.start.elapsed(); - let ex_throttled = self - .ctx - .micros_spent_throttled - .close_and_checked_sub_from(elapsed); - let ex_throttled = match ex_throttled { - Ok(res) => res, - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy>> = - Lazy::new(|| { - Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { - RateLimit::new(Duration::from_secs(10)) - }))) - }); - let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[self.op]; - rate_limit.call(|| { - warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit"); - }); - elapsed - } - }; - - for _ in 0..self.count { - self.global_latency_histo - .observe(ex_throttled.as_secs_f64()); - if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo { - per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64()); - } + let elapsed = self.start.elapsed().as_secs_f64(); + self.global_latency_histo.observe(elapsed); + if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo { + per_timeline_getpage_histo.observe(elapsed); } } } @@ -1289,6 +1264,8 @@ pub(crate) struct SmgrQueryTimePerTimeline { global_latency: [Histogram; SmgrQueryType::COUNT], per_timeline_getpage_started: IntCounter, per_timeline_getpage_latency: Histogram, + global_batch_size: Histogram, + per_timeline_batch_size: Histogram, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1381,6 +1358,76 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy> = Lazy::new(|| { + (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap()) + .map(|v| v.into()) + .collect() +}); + +static PAGE_SERVICE_BATCH_SIZE_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_page_service_batch_size_global", + "Batch size of pageserver page 
service requests", + PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL.clone(), + ) + .expect("failed to define a metric") +}); + +static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy> = Lazy::new(|| { + let mut buckets = Vec::new(); + for i in 0.. { + let bucket = 1 << i; + if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() { + break; + } + buckets.push(bucket.into()); + } + buckets +}); + +static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_page_service_batch_size", + "Batch size of pageserver page service requests", + &["tenant_id", "shard_id", "timeline_id"], + PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE.clone() + ) + .expect("failed to define a metric") +}); + +pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_page_service_config_max_batch_size", + "Configured maximum batch size for the server-side batching functionality of page_service. \ + Labels expose more of the configuration parameters.", + &["mode", "execution"] + ) + .expect("failed to define a metric") +}); + +fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { + PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset(); + let (label_values, value) = match conf { + PageServicePipeliningConfig::Serial => (["serial", "-"], 1), + PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + max_batch_size, + execution, + }) => { + let mode = "pipelined"; + let execution = match execution { + PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => { + "concurrent-futures" + } + PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks", + }; + ([mode, execution], max_batch_size.get()) + } + }; + PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE + .with_label_values(&label_values) + .set(value.try_into().unwrap()); +} + impl SmgrQueryTimePerTimeline { pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); @@ -1416,78 +1463,51 @@ impl SmgrQueryTimePerTimeline { ]) .unwrap(); + let global_batch_size = PAGE_SERVICE_BATCH_SIZE_GLOBAL.clone(); + let per_timeline_batch_size = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) + .unwrap(); + Self { global_started, global_latency, per_timeline_getpage_latency, per_timeline_getpage_started, + global_batch_size, + per_timeline_batch_size, } } - pub(crate) fn start_timer<'c: 'a, 'a>( - &'a self, - op: SmgrQueryType, - ctx: &'c RequestContext, - ) -> Option { - self.start_timer_many(op, 1, ctx) - } - pub(crate) fn start_timer_many<'c: 'a, 'a>( - &'a self, - op: SmgrQueryType, - count: usize, - ctx: &'c RequestContext, - ) -> Option { - let start = Instant::now(); - + pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer { self.global_started[op as usize].inc(); - // We subtract time spent throttled from the observed latency. 
- match ctx.micros_spent_throttled.open() { - Ok(()) => (), - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy>> = - Lazy::new(|| { - Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { - RateLimit::new(Duration::from_secs(10)) - }))) - }); - let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[op]; - rate_limit.call(|| { - warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); - }); - } - } - let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) { self.per_timeline_getpage_started.inc(); - Some(&self.per_timeline_getpage_latency) + Some(self.per_timeline_getpage_latency.clone()) } else { None }; - Some(GlobalAndPerTimelineHistogramTimer { - global_latency_histo: &self.global_latency[op as usize], + SmgrOpTimer { + global_latency_histo: self.global_latency[op as usize].clone(), per_timeline_latency_histo, - ctx, - start, - op, - count, - }) + start: started_at, + } + } + + pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { + self.global_batch_size.observe(batch_size as f64); + self.per_timeline_batch_size.observe(batch_size as f64); } } #[cfg(test)] mod smgr_query_time_tests { + use std::time::Instant; + use pageserver_api::shard::TenantShardId; use strum::IntoEnumIterator; use utils::id::{TenantId, TimelineId}; - use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - }; - // Regression test, we used hard-coded string constants before using an enum. #[test] fn op_label_name() { @@ -1531,8 +1551,7 @@ mod smgr_query_time_tests { let (pre_global, pre_per_tenant_timeline) = get_counts(); assert_eq!(pre_per_tenant_timeline, 0); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); - let timer = metrics.start_timer(*op, &ctx); + let timer = metrics.start_smgr_op(*op, Instant::now()); drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); @@ -1579,58 +1598,24 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| } }); -pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a> { parent: &'a BasebackupQueryTime, - ctx: &'c RequestContext, start: std::time::Instant, } impl BasebackupQueryTime { - pub(crate) fn start_recording<'c: 'a, 'a>( - &'a self, - ctx: &'c RequestContext, - ) -> BasebackupQueryTimeOngoingRecording<'a, 'a> { + pub(crate) fn start_recording(&self) -> BasebackupQueryTimeOngoingRecording<'_> { let start = Instant::now(); - match ctx.micros_spent_throttled.open() { - Ok(()) => (), - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); - }); - } - } BasebackupQueryTimeOngoingRecording { parent: self, - ctx, start, } } } -impl BasebackupQueryTimeOngoingRecording<'_, '_> { +impl BasebackupQueryTimeOngoingRecording<'_> { pub(crate) fn observe(self, res: &Result) { - let elapsed = self.start.elapsed(); - let ex_throttled = self - .ctx - .micros_spent_throttled - .close_and_checked_sub_from(elapsed); - let ex_throttled = match ex_throttled { - Ok(ex_throttled) => ex_throttled, - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut 
rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); - }); - elapsed - } - }; + let elapsed = self.start.elapsed().as_secs_f64(); // If you want to change categorize of a specific error, also change it in `log_query_error`. let metric = match res { Ok(_) => &self.parent.ok, @@ -1641,7 +1626,7 @@ impl BasebackupQueryTimeOngoingRecording<'_, '_> { } Err(_) => &self.parent.error, }; - metric.observe(ex_throttled.as_secs_f64()); + metric.observe(elapsed); } } @@ -2722,6 +2707,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } @@ -2747,10 +2737,12 @@ use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; +use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::Timeline; /// Maintain a per timeline gauge in addition to the global gauge. pub(crate) struct PerTimelineRemotePhysicalSizeGauge { @@ -3562,7 +3554,9 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { .set(u64::try_from(num_threads.get()).unwrap()); } -pub fn preinitialize_metrics() { +pub fn preinitialize_metrics(conf: &'static PageServerConf) { + set_page_service_config_max_batch_size(&conf.page_service_pipelining); + // Python tests need these and on some we do alerting. // // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of @@ -3630,6 +3624,7 @@ pub fn preinitialize_metrics() { &WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM, &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + &PAGE_SERVICE_BATCH_SIZE_GLOBAL, ] .into_iter() .for_each(|h| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1917e7f5b7..64842aa5b8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -51,7 +51,7 @@ use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self}; +use crate::metrics::{self, SmgrOpTimer}; use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -540,11 +540,13 @@ impl From for QueryError { enum BatchedFeMessage { Exists { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamNblocksRequest, }, @@ -552,15 +554,17 @@ enum BatchedFeMessage { span: Span, shard: timeline::handle::Handle, effective_request_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, }, DbSize { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamGetSlruSegmentRequest, }, @@ -632,6 +636,8 @@ impl PageServerHandler { msg = pgb.read_message() => { msg } }; + let received_at = Instant::now(); + let copy_data_bytes = match msg? 
{ Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => { @@ -660,7 +666,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::Exists { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at); + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } } PagestreamFeMessage::Nblocks(req) => { let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); @@ -668,7 +682,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::Nblocks { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at); + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } } PagestreamFeMessage::DbSize(req) => { let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); @@ -676,7 +698,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::DbSize { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at); + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } } PagestreamFeMessage::GetSlruSegment(req) => { let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); @@ -684,7 +714,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::GetSlruSegment { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at); + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } } PagestreamFeMessage::GetPage(PagestreamGetPageRequest { request_lsn, @@ -728,6 +766,14 @@ impl PageServerHandler { return respond_error!(e.into()); } }; + + // It's important to start the timer before waiting for the LSN + // so that the _started counters are incremented before we do + // any serious waiting, e.g., for LSNs. 
+ let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at); + let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, request_lsn, @@ -747,7 +793,7 @@ impl PageServerHandler { span, shard, effective_request_lsn, - pages: smallvec::smallvec![(rel, blkno)], + pages: smallvec::smallvec![(rel, blkno, timer)], } } }; @@ -832,88 +878,112 @@ impl PageServerHandler { IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { // invoke handler function - let (handler_results, span): (Vec>, _) = - match batch { - BatchedFeMessage::Exists { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - ( - vec![ - self.handle_get_rel_exists_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::Nblocks { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - ( - vec![ - self.handle_get_nblocks_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetPage { + let (handler_results, span): ( + Vec>, + _, + ) = match batch { + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + ( + vec![self + .handle_get_rel_exists_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], span, - shard, - effective_request_lsn, - pages, - } => { - fail::fail_point!("ps::handle-pagerequest-message::getpage"); - ( - { - let npages = pages.len(); - trace!(npages, "handling getpage request"); - let res = self - .handle_get_page_at_lsn_request_batched( - &shard, - effective_request_lsn, - pages, - ctx, - ) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::DbSize { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - ( - vec![ - self.handle_db_size_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetSlruSegment { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - ( - vec![ - self.handle_get_slru_segment_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::RespondError { span, error } => { - // We've already decided to respond with an error, so we don't need to - // call the handler. 
- (vec![Err(error)], span) - } - }; + ) + } + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + ( + vec![self + .handle_get_nblocks_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + ( + { + let npages = pages.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_get_page_at_lsn_request_batched( + &shard, + effective_request_lsn, + pages, + ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + ( + vec![self + .handle_db_size_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + ( + vec![self + .handle_get_slru_segment_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need to + // call the handler. + (vec![Err(error)], span) + } + }; // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. + let mut timers: smallvec::SmallVec<[_; 1]> = + smallvec::SmallVec::with_capacity(handler_results.len()); for handler_result in handler_results { let response_msg = match handler_result { Err(e) => match &e { @@ -944,7 +1014,12 @@ impl PageServerHandler { }) } }, - Ok(response_msg) => response_msg, + Ok((response_msg, timer)) => { + // Extending the lifetime of the timers so observations on drop + // include the flush time. 
+ timers.push(timer); + response_msg + } }; // marshal & transmit response message @@ -961,6 +1036,7 @@ impl PageServerHandler { res?; } } + drop(timers); Ok(()) } @@ -1423,10 +1499,6 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -1453,10 +1525,6 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -1483,10 +1551,6 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -1512,26 +1576,41 @@ impl PageServerHandler { &mut self, timeline: &Timeline, effective_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, ctx: &RequestContext, - ) -> Vec> { + ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); - let _timer = timeline.query_metrics.start_timer_many( - metrics::SmgrQueryType::GetPageAtLsn, - pages.len(), - ctx, - ); - let pages = timeline - .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx) + timeline + .query_metrics + .observe_getpage_batch_start(requests.len()); + + let results = timeline + .get_rel_page_at_lsn_batched( + requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)), + effective_lsn, + ctx, + ) .await; + assert_eq!(results.len(), requests.len()); - Vec::from_iter(pages.into_iter().map(|page| { - page.map(|page| { - PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page }) - }) - .map_err(PageStreamError::from) - })) + // TODO: avoid creating the new Vec here + Vec::from_iter( + requests + .into_iter() + .zip(results.into_iter()) + .map(|((_, _, timer), res)| { + res.map(|page| { + ( + PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { + page, + }), + timer, + ) + }) + .map_err(PageStreamError::from) + }), + ) } #[instrument(skip_all, fields(shard_id))] @@ -1541,10 +1620,6 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -2045,7 +2120,7 @@ where COMPUTE_COMMANDS_COUNTERS .for_command(ComputeCommandKind::Basebackup) .inc(); - let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(); let res = async { self.handle_basebackup_request( pgb, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d48a1ba117..a00ec761e2 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -203,9 +203,13 @@ impl Timeline { ) -> Result { match version { Version::Lsn(effective_lsn) => { - let pages = smallvec::smallvec![(tag, blknum)]; + let pages: 
smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)]; let res = self - .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx) + .get_rel_page_at_lsn_batched( + pages.iter().map(|(tag, blknum)| (tag, blknum)), + effective_lsn, + ctx, + ) .await; assert_eq!(res.len(), 1); res.into_iter().next().unwrap() @@ -240,7 +244,7 @@ impl Timeline { /// The ordering of the returned vec corresponds to the ordering of `pages`. pub(crate) async fn get_rel_page_at_lsn_batched( &self, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + pages: impl ExactSizeIterator, effective_lsn: Lsn, ctx: &RequestContext, ) -> Vec> { @@ -254,7 +258,7 @@ impl Timeline { let result_slots = result.spare_capacity_mut(); let mut keys_slots: BTreeMap> = BTreeMap::default(); - for (response_slot_idx, (tag, blknum)) in pages.into_iter().enumerate() { + for (response_slot_idx, (tag, blknum)) in pages.enumerate() { if tag.relnode == 0 { result_slots[response_slot_idx].write(Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -265,7 +269,7 @@ impl Timeline { } let nblocks = match self - .get_rel_size(tag, Version::Lsn(effective_lsn), ctx) + .get_rel_size(*tag, Version::Lsn(effective_lsn), ctx) .await { Ok(nblocks) => nblocks, @@ -276,7 +280,7 @@ impl Timeline { } }; - if blknum >= nblocks { + if *blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", tag, blknum, effective_lsn, nblocks @@ -286,7 +290,7 @@ impl Timeline { continue; } - let key = rel_block_to_key(tag, blknum); + let key = rel_block_to_key(*tag, *blknum); let key_slots = keys_slots.entry(key).or_default(); key_slots.push(response_slot_idx); diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 6a80953901..7c4de55a47 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -2,14 +2,14 @@ use std::{ str::FromStr, sync::{ atomic::{AtomicU64, Ordering}, - Arc, Mutex, + Arc, }, time::{Duration, Instant}, }; use arc_swap::ArcSwap; use enumset::EnumSet; -use tracing::{error, warn}; +use tracing::error; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; use crate::{context::RequestContext, task_mgr::TaskKind}; @@ -162,19 +162,6 @@ where .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); let observation = Observation { wait_time }; self.metric.observe_throttling(&observation); - match ctx.micros_spent_throttled.add(wait_time) { - Ok(res) => res, - Err(error) => { - use once_cell::sync::Lazy; - use utils::rate_limit::RateLimit; - static WARN_RATE_LIMIT: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut guard = WARN_RATE_LIMIT.lock().unwrap(); - guard.call(move || { - warn!(error, "error adding time spent throttled; this message is logged at a global rate limit"); - }); - } - } Some(wait_time) } else { None diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 730477a7f4..dc3f823f20 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1059,7 +1059,8 @@ impl Timeline { .map(|metric| (metric, Instant::now())); // start counting after throttle so that throttle time - // is always less than observation time + // is always less than observation time and we don't + // underflow when computing `ex_throttled` below. 
let throttled = self .timeline_get_throttle .throttle(ctx, key_count as usize) @@ -1138,7 +1139,9 @@ impl Timeline { .map(ScanLatencyOngoingRecording::start_recording); // start counting after throttle so that throttle time - // is always less than observation time + // is always less than observation time and we don't + // underflow when computing the `ex_throttled` value in + // `recording.observe(throttled)` below. let throttled = self .timeline_get_throttle // assume scan = 1 quota for now until we find a better way to process this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 3f90c233a6..ffdbd988a5 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -173,6 +173,7 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), counter("pageserver_tenant_throttling_count"), + *histogram("pageserver_page_service_batch_size"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index c47a849fec..562094a059 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -167,18 +167,18 @@ def test_throughput( @dataclass class Metrics: time: float - pageserver_getpage_count: float - pageserver_vectored_get_count: float + pageserver_batch_size_histo_sum: float + pageserver_batch_size_histo_count: float compute_getpage_count: float pageserver_cpu_seconds_total: float def __sub__(self, other: "Metrics") -> "Metrics": return Metrics( time=self.time - other.time, - pageserver_getpage_count=self.pageserver_getpage_count - - other.pageserver_getpage_count, - pageserver_vectored_get_count=self.pageserver_vectored_get_count - - other.pageserver_vectored_get_count, + pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum + - other.pageserver_batch_size_histo_sum, + pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count + - other.pageserver_batch_size_histo_count, compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total - other.pageserver_cpu_seconds_total, @@ -187,8 +187,8 @@ def test_throughput( def normalize(self, by) -> "Metrics": return Metrics( time=self.time / by, - pageserver_getpage_count=self.pageserver_getpage_count / by, - pageserver_vectored_get_count=self.pageserver_vectored_get_count / by, + pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum / by, + pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by, compute_getpage_count=self.compute_getpage_count / by, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by, ) @@ -202,11 +202,11 @@ def test_throughput( pageserver_metrics = ps_http.get_metrics() return Metrics( time=time.time(), - pageserver_getpage_count=pageserver_metrics.query_one( - "pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"} + pageserver_batch_size_histo_sum=pageserver_metrics.query_one( + "pageserver_page_service_batch_size_sum" ).value, - pageserver_vectored_get_count=pageserver_metrics.query_one( - 
"pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"} + pageserver_batch_size_histo_count=pageserver_metrics.query_one( + "pageserver_page_service_batch_size_count" ).value, compute_getpage_count=compute_getpage_count, pageserver_cpu_seconds_total=pageserver_metrics.query_one( @@ -243,7 +243,7 @@ def test_throughput( # Sanity-checks on the collected data # # assert that getpage counts roughly match between compute and ps - assert metrics.pageserver_getpage_count == pytest.approx( + assert metrics.pageserver_batch_size_histo_sum == pytest.approx( metrics.compute_getpage_count, rel=0.01 ) @@ -256,7 +256,7 @@ def test_throughput( zenbenchmark.record( "perfmetric.batching_factor", - metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count, + metrics.pageserver_batch_size_histo_sum / metrics.pageserver_batch_size_histo_count, unit="", report=MetricReport.HIGHER_IS_BETTER, ) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index ba6a1d9045..62aec50a9e 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -4,6 +4,7 @@ import copy import json import uuid +import pytest from anyio import Path from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log @@ -70,14 +71,21 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P log.info("warmup / make sure metrics are present") run_pagebench_at_max_speed_and_get_total_requests_completed(2) - metrics_query = { + smgr_metrics_query = { "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "smgr_query_type": "get_page_at_lsn", } - metric_name = "pageserver_smgr_query_seconds_sum" - smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query) + smgr_metric_name = "pageserver_smgr_query_seconds_sum" + throttle_metrics_query = { + "tenant_id": str(tenant_id), + } + throttle_metric_name = "pageserver_tenant_throttling_wait_usecs_sum_total" + + smgr_query_seconds_pre = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_pre is not None + throttled_usecs_pre = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) + assert throttled_usecs_pre is not None marker = uuid.uuid4().hex ps_http.post_tracing_event("info", marker) @@ -108,14 +116,23 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P timeout=compaction_period, ) - log.info("validate that the metric doesn't include throttle wait time") - smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query) + log.info("the smgr metric includes throttle time") + smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None + throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) + assert throttled_usecs_post is not None actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre + actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre + actual_throttled_secs = actual_throttled_usecs / 1_000_000 assert ( - duration_secs >= 10 * actual_smgr_query_seconds - ), "smgr metrics should not include throttle wait time" + pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds + ), "smgr metrics include throttle wait time" + smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs + assert 
smgr_ex_throttle > 0 + assert ( + duration_secs > 10 * smgr_ex_throttle + ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" throttle_config_with_field_fair_set = { From 85b12ddd528c827a72f0928a0689edc44e377a06 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:25:29 +0100 Subject: [PATCH 166/209] Add support for the extensions test for Postgres v17 (#9748) ## Problem The extensions for Postgres v17 are ready but we do not test the extensions shipped with v17 ## Summary of changes Build the test image based on Postgres v17. Run the tests for v17. --------- Co-authored-by: Anastasia Lubennikova --- .github/workflows/build_and_test.yml | 15 +- compute/compute-node.Dockerfile | 42 ++--- ...hint_plan.patch => pg_hint_plan_v16.patch} | 0 compute/patches/pg_hint_plan_v17.patch | 174 ++++++++++++++++++ docker-compose/compute_wrapper/Dockerfile | 6 +- docker-compose/docker_compose_test.sh | 27 ++- 6 files changed, 219 insertions(+), 45 deletions(-) rename compute/patches/{pg_hint_plan.patch => pg_hint_plan_v16.patch} (100%) create mode 100644 compute/patches/pg_hint_plan_v17.patch diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9830c2a0c9..e9e111e7bd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -669,7 +669,7 @@ jobs: neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image - if: matrix.version.pg == 'v16' + if: matrix.version.pg >= 'v16' uses: docker/build-push-action@v6 with: context: . @@ -684,8 +684,7 @@ jobs: pull: true file: compute/compute-node.Dockerfile target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} @@ -708,7 +707,7 @@ jobs: push: true pull: true file: compute/compute-node.Dockerfile - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} @@ -744,7 +743,7 @@ jobs: neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image - if: matrix.version.pg == 'v16' + if: matrix.version.pg >= 'v16' run: | docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ 
needs.tag.outputs.build-tag }} \ -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ @@ -833,6 +832,7 @@ jobs: fail-fast: false matrix: arch: [ x64, arm64 ] + pg_version: [v16, v17] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} @@ -871,7 +871,10 @@ jobs: - name: Verify docker-compose example and test extensions timeout-minutes: 20 - run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh + env: + TAG: ${{needs.tag.outputs.build-tag}} + TEST_VERSION_ONLY: ${{ matrix.pg_version }} + run: ./docker-compose/docker_compose_test.sh - name: Print logs and clean up if: always() diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 222a0cb88b..bf6311bf2b 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1367,15 +1367,12 @@ RUN make PG_VERSION="${PG_VERSION}" -C compute FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - mkdir /ext-src +RUN mkdir /ext-src #COPY --from=postgis-build /postgis.tar.gz /ext-src/ #COPY --from=postgis-build /sfcgal/* /usr COPY --from=plv8-build /plv8.tar.gz /ext-src/ -COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ +#COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.patch /ext-src/ @@ -1395,7 +1392,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src -COPY compute/patches/pg_hint_plan.patch /ext-src +COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY compute/patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src @@ -1405,38 +1402,23 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src #COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src #COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src -COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src +#pg_anon is not supported yet for pg v17 so, don't fail if nothing found +COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src COPY compute/patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/ && for f in *.tar.gz; \ +RUN cd /ext-src/ && for f in *.tar.gz; \ do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/rum-src && patch -p1 <../rum.patch -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch +RUN cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - patch -p1 $SPEC_PATH/spec.json + fi PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d echo "wait until the compute is ready. timeout after 60s. " @@ -54,8 +61,7 @@ for pg_version in 14 15 16; do fi done - if [ $pg_version -ge 16 ] - then + if [ $pg_version -ge 16 ]; then echo Enabling trust connection docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " echo Adding postgres role @@ -68,10 +74,13 @@ for pg_version in 14 15 16; do # The test assumes that it is running on the same host with the postgres engine. # In our case it's not true, that's why we are copying files to the compute node TMPDIR=$(mktemp -d) - docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data - echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv - docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data + # Add support for pg_anon for pg_v16 + if [ $pg_version -ne 17 ]; then + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data + echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data rm -rf $TMPDIR + fi TMPDIR=$(mktemp -d) # The following block does the same for the pg_hintplan test docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data @@ -97,4 +106,8 @@ for pg_version in 14 15 16; do fi fi cleanup + # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option + if [ $pg_version -eq 17 ]; then + mv $SPEC_PATH/spec.bak $SPEC_PATH/spec.json + fi done From 0a2a84b766e78ae7288ac3b2624102776a28eea0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 12:35:59 +0100 Subject: [PATCH 167/209] safekeeper,pageserver: add heap profiling (#9778) ## Problem We don't have good observability for memory usage. This would be useful e.g. to debug OOM incidents or optimize performance or resource usage. We would also like to use continuous profiling with e.g. [Grafana Cloud Profiles](https://grafana.com/products/cloud/profiles-for-continuous-profiling/) (see https://github.com/neondatabase/cloud/issues/14888). This PR is intended as a proof of concept, to try it out in staging and drive further discussions about profiling more broadly. Touches https://github.com/neondatabase/neon/issues/9534. Touches https://github.com/neondatabase/cloud/issues/14888. Depends on #9779. Depends on #9780. ## Summary of changes Adds a HTTP route `/profile/heap` that takes a heap profile and returns it. Query parameters: * `format`: output format (`jemalloc` or `pprof`; default `pprof`). 
Unlike CPU profiles (see #9764), heap profiles are not symbolized and require the original binary to translate addresses to function names. To make this work with Grafana, we'll probably have to symbolize the process server-side -- this is left as future work, as is other output formats like SVG. Heap profiles don't work on macOS due to limitations in jemalloc. --- Cargo.lock | 89 ++++++++++++++++++++++++------- Cargo.toml | 3 +- libs/utils/Cargo.toml | 1 + libs/utils/src/http/endpoint.rs | 64 ++++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 5 ++ pageserver/src/http/routes.rs | 8 +-- safekeeper/benches/receive_wal.rs | 6 +++ safekeeper/src/bin/safekeeper.rs | 5 ++ safekeeper/src/http/routes.rs | 7 ++- workspace_hack/Cargo.toml | 15 ++++-- 10 files changed, 175 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba02e3b11d..b2769e59f0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -301,7 +301,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "hex", "http 0.2.9", "hyper 0.14.30", @@ -341,7 +341,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "http 0.2.9", "http-body 0.4.5", "once_cell", @@ -417,7 +417,7 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "hex", "hmac", "http 0.2.9", @@ -621,7 +621,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", @@ -2054,9 +2054,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "ff" @@ -2912,6 +2912,23 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jemalloc_pprof" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb" +dependencies = [ + "anyhow", + "libc", + "mappings", + "once_cell", + "pprof_util", + "tempfile", + "tikv-jemalloc-ctl", + "tokio", + "tracing", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3022,9 +3039,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.150" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" [[package]] name = "libloading" @@ -3044,9 +3061,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "linux-raw-sys" @@ -3079,6 +3096,19 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "mappings" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e" +dependencies = [ + "anyhow", + "libc", + "once_cell", + "pprof_util", + "tracing", +] + [[package]] name = "matchers" version = "0.1.0" @@ -3346,6 +3376,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" dependencies = [ + "num-bigint", "num-complex", "num-integer", "num-iter", @@ -3434,6 +3465,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ "autocfg", + "num-bigint", "num-integer", "num-traits", ] @@ -3497,9 +3529,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.18.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "oorandom" @@ -4298,6 +4330,19 @@ dependencies = [ "thiserror", ] +[[package]] +name = "pprof_util" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781" +dependencies = [ + "anyhow", + "flate2", + "num", + "paste", + "prost", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -5220,14 +5265,14 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ "bitflags 2.4.1", "errno", "libc", - "linux-raw-sys 0.4.13", + "linux-raw-sys 0.4.14", "windows-sys 0.52.0", ] @@ -6251,13 +6296,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", - "fastrand 2.0.0", - "redox_syscall 0.4.1", + "fastrand 2.2.0", + "once_cell", "rustix", "windows-sys 0.52.0", ] @@ -7058,6 +7103,7 @@ dependencies = [ "hex-literal", "humantime", "hyper 0.14.30", + "jemalloc_pprof", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -7644,8 +7690,12 @@ dependencies = [ "memchr", "nix 0.26.4", "nom", + "num", "num-bigint", + "num-complex", "num-integer", + "num-iter", + "num-rational", "num-traits", "once_cell", "parquet", @@ -7669,6 +7719,7 @@ dependencies = [ "subtle", "syn 2.0.90", "sync_wrapper 0.1.2", + "tikv-jemalloc-ctl", "tikv-jemalloc-sys", "time", "time-macros", diff --git a/Cargo.toml b/Cargo.toml index 036dc01057..91fa6a2607 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -115,6 +115,7 @@ indoc = "2" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" +jemalloc_pprof = "0.6" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" @@ -175,7 +176,7 @@ sync_wrapper = "0.1.2" tar = "0.4" test-context = "0.3" thiserror = "1.0" -tikv-jemallocator = { version = "0.6", features = ["stats"] } +tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git 
= "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 5648072a83..66500fb141 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -26,6 +26,7 @@ humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} +jemalloc_pprof.workspace = true jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 6a85f0ddeb..d975b63677 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -10,6 +10,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; @@ -407,6 +408,69 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A } } +/// Generates heap profiles. +/// +/// This only works with jemalloc on Linux. +pub async fn profile_heap_handler(req: Request) -> Result, ApiError> { + enum Format { + Jemalloc, + Pprof, + } + + // Parameters. + let format = match get_query_param(&req, "format")?.as_deref() { + None => Format::Pprof, + Some("jemalloc") => Format::Jemalloc, + Some("pprof") => Format::Pprof, + Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), + }; + + // Obtain profiler handle. + let mut prof_ctl = jemalloc_pprof::PROF_CTL + .as_ref() + .ok_or(ApiError::InternalServerError(anyhow!( + "heap profiling not enabled" + )))? + .lock() + .await; + if !prof_ctl.activated() { + return Err(ApiError::InternalServerError(anyhow!( + "heap profiling not enabled" + ))); + } + + // Take and return the profile. + match format { + Format::Jemalloc => { + // NB: file is an open handle to a tempfile that's already deleted. + let file = tokio::task::spawn_blocking(move || prof_ctl.dump()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + let stream = ReaderStream::new(tokio::fs::File::from_std(file)); + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.dump\"") + .body(Body::wrap_stream(stream)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + + Format::Pprof => { + let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"") + .body(Body::from(data)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + } +} + pub fn add_request_id_middleware( ) -> Middleware { Middleware::pre(move |req| async move { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 31f4370855..8fe225c6aa 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,6 +53,11 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). 
+#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ceb1c3b012..e127871549 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -56,9 +56,9 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::profile_cpu_handler; -use utils::http::endpoint::prometheus_metrics_handler; -use utils::http::endpoint::request_span; +use utils::http::endpoint::{ + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, +}; use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -155,6 +155,7 @@ impl State { "/swagger.yml", "/metrics", "/profile/cpu", + "/profile/heap", ]; Ok(Self { conf, @@ -3203,6 +3204,7 @@ pub fn make_router( .data(state) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) + .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 8c4281cf52..313d945b94 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -24,9 +24,15 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; +/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. +/// This mirrors the configuration in bin/safekeeper.rs. #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + // Register benchmarks with Criterion. criterion_group!( name = benches; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3659bcd7e0..4dc7edef37 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,6 +51,11 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). 
+#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 28294abdb9..69b775fd76 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -14,7 +14,8 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::{ - profile_cpu_handler, prometheus_metrics_handler, request_span, ChannelWriter, + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, + ChannelWriter, }; use utils::http::request::parse_query_param; @@ -573,7 +574,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder let mut router = endpoint::make_router(); if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { - const ALLOWLIST_ROUTES: &[&str] = &["/v1/status", "/metrics", "/profile/cpu"]; + const ALLOWLIST_ROUTES: &[&str] = + &["/v1/status", "/metrics", "/profile/cpu", "profile/heap"]; if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { @@ -594,6 +596,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(auth) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) + .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index c0a3abc377..d19379aefd 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -55,12 +55,16 @@ log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nix = { version = "0.26" } nom = { version = "7" } +num = { version = "0.4" } num-bigint = { version = "0.4" } +num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } +num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } -prost = { version = "0.13", features = ["prost-derive"] } +prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } @@ -76,7 +80,8 @@ smallvec = { version = "1", default-features = false, features = ["const_new", " spki = { version = "0.7", default-features = false, features = ["pem", "std"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } -tikv-jemalloc-sys = { version = "0.6", features = ["stats"] } +tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] } +tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } time = { version = "0.3", features = 
["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } @@ -111,14 +116,18 @@ libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } +num = { version = "0.4" } num-bigint = { version = "0.4" } +num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } +num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] } proc-macro2 = { version = "1" } -prost = { version = "0.13", features = ["prost-derive"] } +prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } From 907e4aa3c4d670960ca17429f4b83aa04ff8ff45 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 15:33:31 +0100 Subject: [PATCH 168/209] test_runner: use immediate shutdown in `test_sharded_ingest` (#9984) ## Problem `test_sharded_ingest` ingests a lot of data, which can cause shutdown to be slow e.g. due to local "S3 uploads" or compactions. This can cause test flakes during teardown. Resolves #9740. ## Summary of changes Perform an immediate shutdown of the cluster. --- test_runner/performance/test_sharded_ingest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 4c21e799c8..94fd54bade 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -90,6 +90,7 @@ def test_sharded_ingest( # Start the endpoint. endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + # Ingest data and measure WAL volume and duration. with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -104,6 +105,8 @@ def test_sharded_ingest( wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Record metrics. wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) @@ -152,3 +155,7 @@ def test_sharded_ingest( log.info(f"WAL ingested by each pageserver {ingested_by_ps}") assert tenant_get_shards(env, tenant_id) == shards, "shards moved" + + # The pageservers can take a long time to shut down gracefully, presumably due to the upload + # queue or compactions or something. Just stop them immediately, we don't care. 
+    env.stop(immediate=True)

From 63cb8ce975c26121bf9f36601649096a775fbbbd Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Tue, 3 Dec 2024 16:25:58 +0100
Subject: [PATCH 169/209] pageserver: only throttle pagestream requests & bring back throttling deduction for smgr latency metrics (#9962)

## Problem

In the batching PR - https://github.com/neondatabase/neon/pull/9870
I stopped deducting the time-spent-in-throttle from latency metrics, i.e.,
- smgr latency metrics (`SmgrOpTimer`)
- basebackup latency (+scan latency, which I think is part of basebackup).

The reason for stopping the deduction was that with the introduction of
batching, the trick with tracking time-spent-in-throttle inside
RequestContext and swap-replacing it from the `impl Drop for SmgrOpTimer`
no longer worked with >1 requests in a batch.

However, deducting time-spent-in-throttle is desirable because our
internal latency SLO definition does not account for throttling.

## Summary of changes

- Redefine throttling to be a page_service pagestream request throttle
  instead of a throttle for repository `Key` reads through
  `Timeline::get` / `Timeline::get_vectored`.
  - This means reads done by `basebackup` are no longer subject to any throttle.
- The throttle applies after batching, before handling of the request.
- Drive-by fix: make throttle sensitive to cancellation.
- Rename metric label `kind` from `timeline_get` to `pagestream` to
  reflect the new scope of throttling.

To avoid config format breakage, we leave the config field named
`timeline_get_throttle` and ignore the `task_kinds` field. This will be
cleaned up in a future PR.

## Trade-Offs

Ideally, we would apply the throttle before reading a request off the
connection, so that we queue the minimal amount of work inside the process.
However, that's not possible because we need to do shard routing.
The redefinition of the throttle to limit pagestream request rate
instead of repository `Key` rate comes with several downsides:
- We're no longer able to use the throttle mechanism for other tasks,
  e.g. image layer creation. However, in practice, we never used that
  capability anyways.
- We no longer throttle basebackup.
---
 libs/pageserver_api/src/models.rs              | 58 ++++++++++-
 pageserver/src/metrics.rs                      | 95 ++++++++++++-------
 pageserver/src/page_service.rs                 | 45 ++++++++-
 pageserver/src/tenant.rs                       | 20 ++--
 pageserver/src/tenant/tasks.rs                 |  6 +-
 pageserver/src/tenant/throttle.rs              | 33 ++-----
 pageserver/src/tenant/timeline.rs              | 54 ++---------
 pageserver/src/tenant/timeline/delete.rs       |  2 +-
 .../test_pageserver_getpage_throttle.py        | 16 ++--
 9 files changed, 198 insertions(+), 131 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 42c5d10c05..5488f7b2c2 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -501,7 +501,9 @@ pub struct EvictionPolicyLayerAccessThreshold {
 
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub struct ThrottleConfig {
-    pub task_kinds: Vec, // TaskKind
+    /// See [`ThrottleConfigTaskKinds`] for why we do the serde `rename`.
+    #[serde(rename = "task_kinds")]
+    pub enabled: ThrottleConfigTaskKinds,
     pub initial: u32,
     #[serde(with = "humantime_serde")]
     pub refill_interval: Duration,
     pub max: u32,
 }
 
+/// Before
+/// the throttle was a per `Timeline::get`/`Timeline::get_vectored` call.
+/// The `task_kinds` field controlled which Pageserver "Task Kind"s
+/// were subject to the throttle.
+///
+/// After that PR, the throttle is applied at pagestream request level
+/// and the `task_kinds` field does not apply since the only task kind
+/// that is subject to the throttle is that of the page service.
+///
+/// However, we don't want to make a breaking config change right now
+/// because it means we have to migrate all the tenant configs.
+/// This will be done in a future PR.
+///
+/// In the meantime, we use emptiness / non-emptiness of the `task_kinds`
+/// field to determine if the throttle is enabled or not.
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+#[serde(transparent)]
+pub struct ThrottleConfigTaskKinds(Vec);
+
+impl ThrottleConfigTaskKinds {
+    pub fn disabled() -> Self {
+        Self(vec![])
+    }
+    pub fn is_enabled(&self) -> bool {
+        !self.0.is_empty()
+    }
+}
+
 impl ThrottleConfig {
     pub fn disabled() -> Self {
         Self {
-            task_kinds: vec![], // effectively disables the throttle
+            enabled: ThrottleConfigTaskKinds::disabled(),
+            // other values don't matter with empty `task_kinds`.
             initial: 0,
             refill_interval: Duration::from_millis(1),
@@ -526,6 +556,30 @@ impl ThrottleConfig {
     }
 }
 
+#[cfg(test)]
+mod throttle_config_tests {
+    use super::*;
+
+    #[test]
+    fn test_disabled_is_disabled() {
+        let config = ThrottleConfig::disabled();
+        assert!(!config.enabled.is_enabled());
+    }
+    #[test]
+    fn test_enabled_backwards_compat() {
+        let input = serde_json::json!({
+            "task_kinds": ["PageRequestHandler"],
+            "initial": 40000,
+            "refill_interval": "50ms",
+            "refill_amount": 1000,
+            "max": 40000,
+            "fair": true
+        });
+        let config: ThrottleConfig = serde_json::from_value(input).unwrap();
+        assert!(config.enabled.is_enabled());
+    }
+}
+
 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
 /// lists out all possible states (and the virtual "Detached" state)
 /// in a flat form rather than using rust-style enums.
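As an editorial illustration of the emptiness-based compatibility rule above (not part of the patch): the tenant config keeps its old `timeline_get_throttle` shape, and only whether `task_kinds` is empty matters. Field names follow the backwards-compat test above and the commit message; the concrete values are illustrative assumptions.

```python
# Hedged sketch: legacy-shaped throttle config under the new semantics.
# A non-empty `task_kinds` list enables the throttle; an empty list disables it
# (the listed kinds themselves are now ignored).
import json

enabled = {
    "task_kinds": ["PageRequestHandler"],  # non-empty == throttle enabled
    "initial": 40000,
    "refill_interval": "50ms",
    "refill_amount": 1000,
    "max": 40000,
}
disabled = dict(enabled, task_kinds=[])  # empty == throttle disabled

print(json.dumps({"timeline_get_throttle": enabled}, indent=2))
```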
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d04fae7627..998c15ccaf 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -217,31 +217,16 @@ impl<'a> ScanLatencyOngoingRecording<'a> { ScanLatencyOngoingRecording { parent, start } } - pub(crate) fn observe(self, throttled: Option) { + pub(crate) fn observe(self) { let elapsed = self.start.elapsed(); - let ex_throttled = if let Some(throttled) = throttled { - elapsed.checked_sub(throttled) - } else { - Some(elapsed) - }; - if let Some(ex_throttled) = ex_throttled { - self.parent.observe(ex_throttled.as_secs_f64()); - } else { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!("error deducting time spent throttled; this message is logged at a global rate limit"); - }); - } + self.parent.observe(elapsed.as_secs_f64()); } } pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", - "Time spent in get_vectored, excluding time spent in timeline_get_throttle.", + "Time spent in get_vectored.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) @@ -264,7 +249,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_scan_seconds", - "Time spent in scan, excluding time spent in timeline_get_throttle.", + "Time spent in scan.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) @@ -1227,11 +1212,44 @@ pub(crate) struct SmgrOpTimer { per_timeline_latency_histo: Option, start: Instant, + throttled: Duration, + op: SmgrQueryType, +} + +impl SmgrOpTimer { + pub(crate) fn deduct_throttle(&mut self, throttle: &Option) { + let Some(throttle) = throttle else { + return; + }; + self.throttled += *throttle; + } } impl Drop for SmgrOpTimer { fn drop(&mut self) { - let elapsed = self.start.elapsed().as_secs_f64(); + let elapsed = self.start.elapsed(); + + let elapsed = match elapsed.checked_sub(self.throttled) { + Some(elapsed) => elapsed, + None => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[self.op]; + rate_limit.call(|| { + warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); + }); + elapsed // un-throttled time, more info than just saturating to 0 + } + }; + + let elapsed = elapsed.as_secs_f64(); + self.global_latency_histo.observe(elapsed); if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo { per_timeline_getpage_histo.observe(elapsed); @@ -1491,6 +1509,8 @@ impl SmgrQueryTimePerTimeline { global_latency_histo: self.global_latency[op as usize].clone(), per_timeline_latency_histo, start: started_at, + op, + throttled: Duration::ZERO, } } @@ -3299,7 +3319,7 @@ pub(crate) mod tenant_throttling { use once_cell::sync::Lazy; use utils::shard::TenantShardId; - use crate::tenant::{self, throttle::Metric}; + use crate::tenant::{self}; struct GlobalAndPerTenantIntCounter { global: IntCounter, @@ -3318,7 +3338,7 @@ pub(crate) mod tenant_throttling { } } - pub(crate) struct TimelineGet { + pub(crate) struct Metrics { count_accounted_start: 
GlobalAndPerTenantIntCounter, count_accounted_finish: GlobalAndPerTenantIntCounter, wait_time: GlobalAndPerTenantIntCounter, @@ -3391,40 +3411,41 @@ pub(crate) mod tenant_throttling { .unwrap() }); - const KIND: &str = "timeline_get"; + const KINDS: &[&str] = &["pagestream"]; + pub type Pagestream = Metrics<0>; - impl TimelineGet { + impl Metrics { pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self { let per_tenant_label_values = &[ - KIND, + KINDS[KIND], &tenant_shard_id.tenant_id.to_string(), &tenant_shard_id.shard_slug().to_string(), ]; - TimelineGet { + Metrics { count_accounted_start: { GlobalAndPerTenantIntCounter { - global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]), + global: COUNT_ACCOUNTED_START.with_label_values(&[KINDS[KIND]]), per_tenant: COUNT_ACCOUNTED_START_PER_TENANT .with_label_values(per_tenant_label_values), } }, count_accounted_finish: { GlobalAndPerTenantIntCounter { - global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]), + global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KINDS[KIND]]), per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT .with_label_values(per_tenant_label_values), } }, wait_time: { GlobalAndPerTenantIntCounter { - global: WAIT_USECS.with_label_values(&[KIND]), + global: WAIT_USECS.with_label_values(&[KINDS[KIND]]), per_tenant: WAIT_USECS_PER_TENANT .with_label_values(per_tenant_label_values), } }, count_throttled: { GlobalAndPerTenantIntCounter { - global: WAIT_COUNT.with_label_values(&[KIND]), + global: WAIT_COUNT.with_label_values(&[KINDS[KIND]]), per_tenant: WAIT_COUNT_PER_TENANT .with_label_values(per_tenant_label_values), } @@ -3447,15 +3468,17 @@ pub(crate) mod tenant_throttling { &WAIT_USECS_PER_TENANT, &WAIT_COUNT_PER_TENANT, ] { - let _ = m.remove_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]); + for kind in KINDS { + let _ = m.remove_label_values(&[ + kind, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]); + } } } - impl Metric for TimelineGet { + impl tenant::throttle::Metric for Metrics { #[inline(always)] fn accounting_start(&self) { self.count_accounted_start.inc(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 64842aa5b8..7026df9527 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -574,6 +574,41 @@ enum BatchedFeMessage { }, } +impl BatchedFeMessage { + async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> { + let (shard, tokens, timers) = match self { + BatchedFeMessage::Exists { shard, timer, .. } + | BatchedFeMessage::Nblocks { shard, timer, .. } + | BatchedFeMessage::DbSize { shard, timer, .. } + | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => { + ( + shard, + // 1 token is probably under-estimating because these + // request handlers typically do several Timeline::get calls. + 1, + itertools::Either::Left(std::iter::once(timer)), + ) + } + BatchedFeMessage::GetPage { shard, pages, .. } => ( + shard, + pages.len(), + itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)), + ), + BatchedFeMessage::RespondError { .. } => return Ok(()), + }; + let throttled = tokio::select! 
{ + throttled = shard.pagestream_throttle.throttle(tokens) => { throttled } + _ = cancel.cancelled() => { + return Err(QueryError::Shutdown); + } + }; + for timer in timers { + timer.deduct_throttle(&throttled); + } + Ok(()) + } +} + impl PageServerHandler { pub fn new( tenant_manager: Arc, @@ -1157,13 +1192,18 @@ impl PageServerHandler { Ok(msg) => msg, Err(e) => break e, }; - let msg = match msg { + let mut msg = match msg { Some(msg) => msg, None => { debug!("pagestream subprotocol end observed"); return ((pgb_reader, timeline_handles), Ok(())); } }; + + if let Err(cancelled) = msg.throttle(&self.cancel).await { + break cancelled; + } + let err = self .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx) .await; @@ -1321,12 +1361,13 @@ impl PageServerHandler { return Ok(()); } }; - let batch = match batch { + let mut batch = match batch { Ok(batch) => batch, Err(e) => { return Err(e); } }; + batch.throttle(&self.cancel).await?; self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) .await?; } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cd0690bb1a..ada5c4a977 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -357,8 +357,8 @@ pub struct Tenant { /// Throttle applied at the top of [`Timeline::get`]. /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. - pub(crate) timeline_get_throttle: - Arc>, + pub(crate) pagestream_throttle: + Arc>, /// An ongoing timeline detach concurrency limiter. /// @@ -1678,7 +1678,7 @@ impl Tenant { remote_metadata, TimelineResources { remote_client, - timeline_get_throttle: self.timeline_get_throttle.clone(), + pagestream_throttle: self.pagestream_throttle.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), }, LoadTimelineCause::Attach, @@ -3835,7 +3835,7 @@ impl Tenant { } } - fn get_timeline_get_throttle_config( + fn get_pagestream_throttle_config( psconf: &'static PageServerConf, overrides: &TenantConfOpt, ) -> throttle::Config { @@ -3846,8 +3846,8 @@ impl Tenant { } pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { - let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf); - self.timeline_get_throttle.reconfigure(conf) + let conf = Self::get_pagestream_throttle_config(self.conf, new_conf); + self.pagestream_throttle.reconfigure(conf) } /// Helper function to create a new Timeline struct. 
@@ -4009,9 +4009,9 @@ impl Tenant { attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), cancel: CancellationToken::default(), gate: Gate::default(), - timeline_get_throttle: Arc::new(throttle::Throttle::new( - Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), - crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id), + pagestream_throttle: Arc::new(throttle::Throttle::new( + Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf), + crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id), )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), @@ -4909,7 +4909,7 @@ impl Tenant { fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { TimelineResources { remote_client: self.build_timeline_remote_client(timeline_id), - timeline_get_throttle: self.timeline_get_throttle.clone(), + pagestream_throttle: self.pagestream_throttle.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 16dac10dca..0118a5ce5f 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -471,14 +471,14 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. // Or just spawn another background loop for this throttle, it's not like it's super costly. - info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { let now = Instant::now(); let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); - let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats(); + let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); if count_throttled == 0 { return; } - let allowed_rps = tenant.timeline_get_throttle.steady_rps(); + let allowed_rps = tenant.pagestream_throttle.steady_rps(); let delta = now - prev; info!( n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 7c4de55a47..54c0e59daa 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -1,5 +1,4 @@ use std::{ - str::FromStr, sync::{ atomic::{AtomicU64, Ordering}, Arc, @@ -8,12 +7,8 @@ use std::{ }; use arc_swap::ArcSwap; -use enumset::EnumSet; -use tracing::error; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; -use crate::{context::RequestContext, task_mgr::TaskKind}; - /// Throttle for `async` functions. /// /// Runtime reconfigurable. 
@@ -35,7 +30,7 @@ pub struct Throttle { } pub struct Inner { - task_kinds: EnumSet, + enabled: bool, rate_limiter: Arc, } @@ -79,26 +74,12 @@ where } fn new_inner(config: Config) -> Inner { let Config { - task_kinds, + enabled, initial, refill_interval, refill_amount, max, } = config; - let task_kinds: EnumSet = task_kinds - .iter() - .filter_map(|s| match TaskKind::from_str(s) { - Ok(v) => Some(v), - Err(e) => { - // TODO: avoid this failure mode - error!( - "cannot parse task kind, ignoring for rate limiting {}", - utils::error::report_compact_sources(&e) - ); - None - } - }) - .collect(); // steady rate, we expect `refill_amount` requests per `refill_interval`. // dividing gives us the rps. @@ -112,7 +93,7 @@ where let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens)); Inner { - task_kinds, + enabled: enabled.is_enabled(), rate_limiter: Arc::new(rate_limiter), } } @@ -141,11 +122,13 @@ where self.inner.load().rate_limiter.steady_rps() } - pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option { + pub async fn throttle(&self, key_count: usize) -> Option { let inner = self.inner.load_full(); // clones the `Inner` Arc - if !inner.task_kinds.contains(ctx.task_kind()) { + + if !inner.enabled { return None; - }; + } + let start = std::time::Instant::now(); self.metric.accounting_start(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index dc3f823f20..1414bef0a5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -208,8 +208,8 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub timeline_get_throttle: - Arc>, + pub pagestream_throttle: + Arc>, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -411,9 +411,9 @@ pub struct Timeline { /// Timeline deletion will acquire both compaction and gc locks in whatever order. gc_lock: tokio::sync::Mutex<()>, - /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction. - timeline_get_throttle: - Arc>, + /// Cloned from [`super::Tenant::pagestream_throttle`] on construction. + pub(crate) pagestream_throttle: + Arc>, /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, @@ -949,7 +949,7 @@ impl Timeline { /// If a remote layer file is needed, it is downloaded as part of this /// call. /// - /// This method enforces [`Self::timeline_get_throttle`] internally. + /// This method enforces [`Self::pagestream_throttle`] internally. /// /// NOTE: It is considered an error to 'get' a key that doesn't exist. The /// abstraction above this needs to store suitable metadata to track what @@ -977,8 +977,6 @@ impl Timeline { // page_service. debug_assert!(!self.shard_identity.is_key_disposable(&key)); - self.timeline_get_throttle.throttle(ctx, 1).await; - let keyspace = KeySpace { ranges: vec![key..key.next()], }; @@ -1058,14 +1056,6 @@ impl Timeline { .for_task_kind(ctx.task_kind()) .map(|metric| (metric, Instant::now())); - // start counting after throttle so that throttle time - // is always less than observation time and we don't - // underflow when computing `ex_throttled` below. 
- let throttled = self - .timeline_get_throttle - .throttle(ctx, key_count as usize) - .await; - let res = self .get_vectored_impl( keyspace.clone(), @@ -1077,23 +1067,7 @@ impl Timeline { if let Some((metric, start)) = start { let elapsed = start.elapsed(); - let ex_throttled = if let Some(throttled) = throttled { - elapsed.checked_sub(throttled) - } else { - Some(elapsed) - }; - - if let Some(ex_throttled) = ex_throttled { - metric.observe(ex_throttled.as_secs_f64()); - } else { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!("error deducting time spent throttled; this message is logged at a global rate limit"); - }); - } + metric.observe(elapsed.as_secs_f64()); } res @@ -1138,16 +1112,6 @@ impl Timeline { .for_task_kind(ctx.task_kind()) .map(ScanLatencyOngoingRecording::start_recording); - // start counting after throttle so that throttle time - // is always less than observation time and we don't - // underflow when computing the `ex_throttled` value in - // `recording.observe(throttled)` below. - let throttled = self - .timeline_get_throttle - // assume scan = 1 quota for now until we find a better way to process this - .throttle(ctx, 1) - .await; - let vectored_res = self .get_vectored_impl( keyspace.clone(), @@ -1158,7 +1122,7 @@ impl Timeline { .await; if let Some(recording) = start { - recording.observe(throttled); + recording.observe(); } vectored_res @@ -2374,7 +2338,7 @@ impl Timeline { standby_horizon: AtomicLsn::new(0), - timeline_get_throttle: resources.timeline_get_throttle, + pagestream_throttle: resources.pagestream_throttle, aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 67fc710c44..47a93b19d2 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -298,7 +298,7 @@ impl DeleteTimelineFlow { None, // Ancestor is not needed for deletion. TimelineResources { remote_client, - timeline_get_throttle: tenant.timeline_get_throttle.clone(), + pagestream_throttle: tenant.pagestream_throttle.clone(), l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. 
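Pulling the pieces of this patch together: the batch is assembled first, then the handler waits on the throttle once per batch (bailing out on shutdown), and the wait is recorded so the smgr timer can subtract it when the latency histogram is observed. A rough, self-contained sketch of that control flow, with hypothetical helper names and a `sleep` standing in for the leaky-bucket wait; it assumes the `tokio` and `tokio-util` crates, not the pageserver's actual types.

```rust
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;

/// Stand-in for `SmgrOpTimer`: measures wall-clock time and later subtracts
/// whatever portion of it was spent waiting in the throttle.
struct OpTimer {
    start: Instant,
    throttled: Duration,
}

impl OpTimer {
    fn deduct_throttle(&mut self, throttled: &Option<Duration>) {
        if let Some(d) = throttled {
            self.throttled += *d;
        }
    }

    fn observe(self) -> Duration {
        let elapsed = self.start.elapsed();
        // If the subtraction would underflow, keep the un-deducted value
        // instead of silently reporting zero.
        elapsed.checked_sub(self.throttled).unwrap_or(elapsed)
    }
}

/// Stand-in for `Throttle::throttle`: returns how long the caller waited,
/// or `None` when the throttle is disabled. A sleep simulates the wait.
async fn throttle(tokens: usize) -> Option<Duration> {
    let start = Instant::now();
    tokio::time::sleep(Duration::from_millis(tokens as u64)).await;
    Some(start.elapsed())
}

/// One throttle wait per batch, taken after batching and before handling,
/// and sensitive to shutdown.
async fn handle_batch(
    n_pages: usize,
    cancel: &CancellationToken,
    timer: &mut OpTimer,
) -> Result<(), &'static str> {
    let throttled = tokio::select! {
        t = throttle(n_pages) => t,
        _ = cancel.cancelled() => return Err("shutdown"),
    };
    timer.deduct_throttle(&throttled);
    // ... handle the pagestream batch here ...
    Ok(())
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let mut timer = OpTimer {
        start: Instant::now(),
        throttled: Duration::ZERO,
    };
    handle_batch(32, &cancel, &mut timer).await.unwrap();
    println!("latency excluding throttle: {:?}", timer.observe());
}
```

The `checked_sub` fallback mirrors the rate-limited warning path added to `metrics.rs` above: if recorded throttle time ever exceeds the wall-clock time, the un-deducted elapsed time is reported rather than saturating to zero.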
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 62aec50a9e..6d0661f068 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -33,7 +33,9 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P conf={ "compaction_period": f"{compaction_period}s", "timeline_get_throttle": { - "task_kinds": ["PageRequestHandler"], + "task_kinds": [ + "PageRequestHandler" + ], # any non-empty array will do here https://github.com/neondatabase/neon/pull/9962 "initial": 0, "refill_interval": "100ms", "refill_amount": int(rate_limit_rps / 10), @@ -116,7 +118,6 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P timeout=compaction_period, ) - log.info("the smgr metric includes throttle time") smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) @@ -125,13 +126,14 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre actual_throttled_secs = actual_throttled_usecs / 1_000_000 + log.info("validate that the metric doesn't include throttle wait time") assert ( - pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds - ), "smgr metrics include throttle wait time" - smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs - assert smgr_ex_throttle > 0 + duration_secs >= 10 * actual_smgr_query_seconds + ), "smgr metrics should not include throttle wait time" + + log.info("validate that the throttling wait time metrics is correct") assert ( - duration_secs > 10 * smgr_ex_throttle + pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" From 69c0d61c5c319a4f2a8fb0d3613af5020ca03c43 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 16:55:00 +0000 Subject: [PATCH 170/209] storcon: in shard splits, inherit parent's AZ (#9946) ## Problem Sharded tenants should be run in a single AZ for best performance, so that computes have AZ-local latency to all the shards. Part of https://github.com/neondatabase/neon/issues/8264 ## Summary of changes - When we split a tenant, instead of updating each shard's preferred AZ to wherever it is scheduled, propagate the preferred AZ from the parent. - Drop the check in `test_shard_preferred_azs` that asserts shards end up in their preferred AZ: this will not be true again until the optimize_attachment logic is updated to make this so. The existing check wasn't testing anything about scheduling, it was just asserting that we set preferred AZ in a way that matches the way things happen to be scheduled at time of split. 
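The rule itself fits in a few lines. A toy sketch with stand-in types (not the storage controller's real `TenantShard` or `AvailabilityZone`), just to show that children copy the parent's preferred AZ rather than deriving it from wherever they happen to be scheduled:

```rust
#[derive(Clone, Debug, PartialEq, Eq)]
struct AvailabilityZone(String);

struct ParentShard {
    preferred_az: Option<AvailabilityZone>,
}

struct ChildShard {
    preferred_az: Option<AvailabilityZone>,
}

/// New behaviour: every child inherits the parent's preferred AZ, regardless
/// of which node (and therefore which AZ) it is initially attached to.
fn split(parent: &ParentShard, new_shard_count: u8) -> Vec<ChildShard> {
    (0..new_shard_count)
        .map(|_| ChildShard {
            preferred_az: parent.preferred_az.clone(),
        })
        .collect()
}

fn main() {
    let parent = ParentShard {
        preferred_az: Some(AvailabilityZone("eu-west-1a".into())),
    };
    let children = split(&parent, 4);
    assert!(children.iter().all(|c| c.preferred_az == parent.preferred_az));
    println!("{} children share the parent's preferred AZ", children.len());
}
```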
--- storage_controller/src/service.rs | 68 ++++++------------- .../regress/test_storage_controller.py | 6 +- 2 files changed, 24 insertions(+), 50 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 52c9c4710d..741d3dc2b4 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -44,12 +44,12 @@ use futures::{stream::FuturesUnordered, StreamExt}; use itertools::Itertools; use pageserver_api::{ controller_api::{ - MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, - NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest, - TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, - TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, + AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, + NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, + ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, + TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, + TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, + TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::{ SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, @@ -468,6 +468,7 @@ struct ShardSplitParams { policy: PlacementPolicy, config: TenantConfig, shard_ident: ShardIdentity, + preferred_az_id: Option, } // When preparing for a shard split, we may either choose to proceed with the split, @@ -4103,7 +4104,7 @@ impl Service { for parent_id in parent_ids { let child_ids = parent_id.split(new_shard_count); - let (pageserver, generation, policy, parent_ident, config) = { + let (pageserver, generation, policy, parent_ident, config, preferred_az) = { let mut old_state = tenants .remove(&parent_id) .expect("It was present, we just split it"); @@ -4122,6 +4123,7 @@ impl Service { old_state.policy.clone(), old_state.shard, old_state.config.clone(), + old_state.preferred_az().cloned(), ) }; @@ -4154,6 +4156,9 @@ impl Service { }; child_state.generation = Some(generation); child_state.config = config.clone(); + if let Some(preferred_az) = &preferred_az { + child_state.set_preferred_az(preferred_az.clone()); + } // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: @@ -4346,6 +4351,7 @@ impl Service { let mut policy = None; let mut config = None; let mut shard_ident = None; + let mut preferred_az_id = None; // Validate input, and calculate which shards we will create let (old_shard_count, targets) = { @@ -4404,6 +4410,9 @@ impl Service { if config.is_none() { config = Some(shard.config.clone()); } + if preferred_az_id.is_none() { + preferred_az_id = shard.preferred_az().cloned(); + } if tenant_shard_id.shard_count.count() == split_req.new_shard_count { tracing::info!( @@ -4474,6 +4483,7 @@ impl Service { policy, config, shard_ident, + preferred_az_id, }))) } @@ -4496,6 +4506,7 @@ impl Service { policy, config, shard_ident, + preferred_az_id, } = *params; // Drop any secondary locations: pageservers do not support splitting these, and in any case the @@ -4569,7 +4580,7 @@ impl Service { // Scheduling policies and preferred AZ 
do not carry through to children scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), - preferred_az_id: None, + preferred_az_id: preferred_az_id.as_ref().map(|az| az.0.clone()), }); } @@ -4689,47 +4700,6 @@ impl Service { let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); - // Now that we have scheduled the child shards, attempt to set their preferred AZ - // to that of the pageserver they've been attached on. - let preferred_azs = { - let locked = self.inner.read().unwrap(); - child_locations - .iter() - .filter_map(|(tid, node_id, _stripe_size)| { - let az_id = locked - .nodes - .get(node_id) - .map(|n| n.get_availability_zone_id().clone())?; - - Some((*tid, az_id)) - }) - .collect::>() - }; - - let updated = self - .persistence - .set_tenant_shard_preferred_azs(preferred_azs) - .await - .map_err(|err| { - ApiError::InternalServerError(anyhow::anyhow!( - "Failed to persist preferred az ids: {err}" - )) - }); - - match updated { - Ok(updated) => { - let mut locked = self.inner.write().unwrap(); - for (tid, az_id) in updated { - if let Some(shard) = locked.tenants.get_mut(&tid) { - shard.set_preferred_az(az_id); - } - } - } - Err(err) => { - tracing::warn!("Failed to persist preferred AZs after split: {err}"); - } - } - // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps, stripe_size) in child_locations { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 244893a616..f878116d53 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3057,7 +3057,11 @@ def test_shard_preferred_azs(neon_env_builder: NeonEnvBuilder): for shard in shards: attached_to = shard["node_attached"] expected_az = env.get_pageserver(attached_to).az_id - assert shard["preferred_az_id"] == expected_az + + # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed + # in putting the tenant shards in the preferred AZ. + # To be fixed in https://github.com/neondatabase/neon/pull/9916 + # assert shard["preferred_az_id"] == expected_az @run_only_on_default_postgres("Postgres version makes no difference here") From 41bb9c528009ff24ffe223cc2372722ee3054601 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 17:22:49 +0000 Subject: [PATCH 171/209] pageserver: only store SLRUs & aux files on shard zero (#9786) ## Problem Since https://github.com/neondatabase/neon/pull/9423 the non-zero shards no longer need SLRU content in order to do GC. This data is now redundant on shards >0. One release cycle after merging that PR, we may merge this one, which also stops writing those pages to shards > 0, reaping the efficiency benefit. 
Closes: https://github.com/neondatabase/neon/issues/7512 Closes: https://github.com/neondatabase/neon/issues/9641 ## Summary of changes - Avoid storing SLRUs on non-zero shards - Bonus: avoid storing aux files on non-zero shards --- libs/pageserver_api/src/key.rs | 5 ++ libs/pageserver_api/src/shard.rs | 34 +++++++++--- libs/wal_decoder/src/decoder.rs | 54 ++++++++++-------- pageserver/src/import_datadir.rs | 18 ++++-- pageserver/src/pgdatadir_mapping.rs | 55 ++++++++++++------- .../src/tenant/timeline/import_pgdata/flow.rs | 49 +++++++---------- pageserver/src/walingest.rs | 4 ++ 7 files changed, 134 insertions(+), 85 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 523d143381..37dff6fe46 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -770,6 +770,11 @@ impl Key { && self.field6 == 1 } + #[inline(always)] + pub fn is_aux_file_key(&self) -> bool { + self.field1 == AUX_KEY_PREFIX + } + /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. #[inline(always)] pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e83cf4c855..a5c94a82c1 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -170,19 +170,37 @@ impl ShardIdentity { } } + /// Return true if the key should be stored on all shards, not just one. + fn is_key_global(&self, key: &Key) -> bool { + if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() { + // Special keys that are only stored on shard 0 + false + } else if key.is_rel_block_key() { + // Ordinary relation blocks are distributed across shards + false + } else if key.is_rel_size_key() { + // All shards maintain rel size keys (although only shard 0 is responsible for + // keeping it strictly accurate, other shards just reflect the highest block they've ingested) + true + } else { + // For everything else, we assume it must be kept everywhere, because ingest code + // might assume this -- this covers functionality where the ingest code has + // not (yet) been made fully shard aware. + true + } + } + /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. /// /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { - if key_is_shard0(key) { - // Q: Why can't we dispose of shard0 content if we're not shard 0? - // A1: because the WAL ingestion logic currently ingests some shard 0 - // content on all shards, even though it's only read on shard 0. If we - // dropped it, then subsequent WAL ingest to these keys would encounter - // an error. - // A2: because key_is_shard0 also covers relation size keys, which are written - // on all shards even though they're only maintained accurately on shard 0. + if self.count < ShardCount(2) { + // Fast path: unsharded tenant doesn't dispose of anything + return false; + } + + if self.is_key_global(key) { false } else { !self.is_key_local(key) diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index 36c4b19266..aa50c62911 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -112,30 +112,38 @@ impl MetadataRecord { }; // Next, filter the metadata record by shard. - - // Route VM page updates to the shards that own them. 
VM pages are stored in the VM fork - // of the main relation. These are sharded and managed just like regular relation pages. - // See: https://github.com/neondatabase/neon/issues/9855 - if let Some( - MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) - | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), - ) = metadata_record - { - let is_local_vm_page = |heap_blk| { - let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); - shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) - }; - // Send the old and new VM page updates to their respective shards. - clear_vm_bits.old_heap_blkno = clear_vm_bits - .old_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - clear_vm_bits.new_heap_blkno = clear_vm_bits - .new_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - // If neither VM page belongs to this shard, discard the record. - if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() { - metadata_record = None + match metadata_record { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), + ) => { + // Route VM page updates to the shards that own them. VM pages are stored in the VM fork + // of the main relation. These are sharded and managed just like regular relation pages. + // See: https://github.com/neondatabase/neon/issues/9855 + let is_local_vm_page = |heap_blk| { + let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); + shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) + }; + // Send the old and new VM page updates to their respective shards. + clear_vm_bits.old_heap_blkno = clear_vm_bits + .old_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + clear_vm_bits.new_heap_blkno = clear_vm_bits + .new_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + // If neither VM page belongs to this shard, discard the record. 
+ if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() + { + metadata_record = None + } } + Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { + // Filter LogicalMessage records (AUX files) to only be stored on shard zero + if !shard.is_shard_zero() { + metadata_record = None; + } + } + _ => {} } Ok(metadata_record) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 06c4553e1c..c061714010 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -575,18 +575,24 @@ async fn import_file( } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported clog slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported clog slru"); + } } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported multixact offsets slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported multixact offsets slru"); + } } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported multixact members slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported multixact members slru"); + } } else if file_path.starts_with("pg_twophase") { let bytes = read_all_bytes(reader).await?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a00ec761e2..255bd01e25 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -530,6 +530,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let n_blocks = self .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) .await?; @@ -552,6 +553,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn, ctx).await } @@ -564,6 +566,7 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let key = slru_segment_size_to_key(kind, segno); let mut buf = version.get(self, key, ctx).await?; Ok(buf.get_u32_le()) @@ -577,6 +580,7 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); // fetch directory listing let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; @@ -1047,26 +1051,28 @@ impl Timeline { } // Iterate SLRUs next - for kind in [ - SlruKind::Clog, - SlruKind::MultiXactMembers, - SlruKind::MultiXactOffsets, - ] { - let slrudir_key = slru_dir_to_key(kind); - result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn, ctx).await?; - let dir = SlruSegmentDirectory::des(&buf)?; - let mut segments: Vec = dir.segments.iter().cloned().collect(); - segments.sort_unstable(); - for segno in segments { - let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn, ctx).await?; - let 
segsize = buf.get_u32_le(); + if self.tenant_shard_id.is_shard_zero() { + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactMembers, + SlruKind::MultiXactOffsets, + ] { + let slrudir_key = slru_dir_to_key(kind); + result.add_key(slrudir_key); + let buf = self.get(slrudir_key, lsn, ctx).await?; + let dir = SlruSegmentDirectory::des(&buf)?; + let mut segments: Vec = dir.segments.iter().cloned().collect(); + segments.sort_unstable(); + for segno in segments { + let segsize_key = slru_segment_size_to_key(kind, segno); + let mut buf = self.get(segsize_key, lsn, ctx).await?; + let segsize = buf.get_u32_le(); - result.add_range( - slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), - ); - result.add_key(segsize_key); + result.add_range( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), + ); + result.add_key(segsize_key); + } } } @@ -1468,6 +1474,10 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> anyhow::Result<()> { + if !self.tline.tenant_shard_id.is_shard_zero() { + return Ok(()); + } + self.put( slru_block_to_key(kind, segno, blknum), Value::WalRecord(rec), @@ -1501,6 +1511,8 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { anyhow::bail!( @@ -1542,6 +1554,7 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { anyhow::bail!( @@ -1853,6 +1866,8 @@ impl<'a> DatadirModification<'a> { nblocks: BlockNumber, ctx: &RequestContext, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + // Add it to the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key, ctx).await?; @@ -1885,6 +1900,8 @@ impl<'a> DatadirModification<'a> { segno: u32, nblocks: BlockNumber, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + // Put size let size_key = slru_segment_size_to_key(kind, segno); let buf = nblocks.to_le_bytes(); diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index cbd4168c06..4388072606 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -129,22 +129,23 @@ impl Flow { } // Import SLRUs - - // pg_xact (01:00 keyspace) - self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + if self.timeline.tenant_shard_id.is_shard_zero() { + // pg_xact (01:00 keyspace) + self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + .await?; + // pg_multixact/members (01:01 keyspace) + self.import_slru( + SlruKind::MultiXactMembers, + &self.storage.pgdata().join("pg_multixact/members"), + ) .await?; - // pg_multixact/members (01:01 keyspace) - self.import_slru( - SlruKind::MultiXactMembers, - &self.storage.pgdata().join("pg_multixact/members"), - ) - .await?; - // pg_multixact/offsets (01:02 keyspace) - self.import_slru( - SlruKind::MultiXactOffsets, - &self.storage.pgdata().join("pg_multixact/offsets"), - ) - .await?; + // pg_multixact/offsets (01:02 keyspace) + self.import_slru( + SlruKind::MultiXactOffsets, + &self.storage.pgdata().join("pg_multixact/offsets"), + ) + .await?; + } // Import 
pg_twophase. // TODO: as empty @@ -302,6 +303,8 @@ impl Flow { } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { + assert!(self.timeline.tenant_shard_id.is_shard_zero()); + let segments = self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments .into_iter() @@ -337,7 +340,6 @@ impl Flow { debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment"); self.tasks .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new( - *self.timeline.get_shard_identity(), start_key..end_key, &p, self.storage.clone(), @@ -631,21 +633,14 @@ impl ImportTask for ImportRelBlocksTask { } struct ImportSlruBlocksTask { - shard_identity: ShardIdentity, key_range: Range, path: RemotePath, storage: RemoteStorageWrapper, } impl ImportSlruBlocksTask { - fn new( - shard_identity: ShardIdentity, - key_range: Range, - path: &RemotePath, - storage: RemoteStorageWrapper, - ) -> Self { + fn new(key_range: Range, path: &RemotePath, storage: RemoteStorageWrapper) -> Self { ImportSlruBlocksTask { - shard_identity, key_range, path: path.clone(), storage, @@ -673,17 +668,13 @@ impl ImportTask for ImportSlruBlocksTask { let mut file_offset = 0; while blknum < end_blk { let key = slru_block_to_key(kind, segno, blknum); - assert!( - !self.shard_identity.is_key_disposable(&key), - "SLRU keys need to go into every shard" - ); let buf = &buf[file_offset..(file_offset + 8192)]; file_offset += 8192; layer_writer .put_image(key, Bytes::copy_from_slice(buf), ctx) .await?; - blknum += 1; nimages += 1; + blknum += 1; } Ok(nimages) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index d568da596a..93ae88936f 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1392,6 +1392,10 @@ impl WalIngest { img: Bytes, ctx: &RequestContext, ) -> Result<()> { + if !self.shard.is_shard_zero() { + return Ok(()); + } + self.handle_slru_extend(modification, kind, segno, blknum, ctx) .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; From f7d5322e8b428ea4277e2bdf572612e4733d193a Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 18:36:37 +0000 Subject: [PATCH 172/209] pageserver: more detailed logs when calling re-attach (#9996) ## Problem We saw a peculiar case where a pageserver apparently got a 0-tenant response to `/re-attach` but we couldn't see the request landing on a storage controller. It was hard to confirm retrospectively that the pageserver was configured properly at the moment it sent the request. 
## Summary of changes - Log the URL to which we are sending the request - Log the NodeId and metadata that we sent --- libs/pageserver_api/src/controller_api.rs | 4 ++-- pageserver/src/controller_upcall_client.rs | 12 +++++++++--- pageserver/src/tenant/mgr.rs | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 0ea30ce54f..9a5ebc95bd 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -48,7 +48,7 @@ pub struct TenantCreateResponse { pub shards: Vec, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct NodeRegisterRequest { pub node_id: NodeId, @@ -75,7 +75,7 @@ pub struct TenantPolicyRequest { pub scheduling: Option, } -#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)] pub struct AvailabilityZone(pub String); impl Display for AvailabilityZone { diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 73fc6dc3ab..d41bfd9021 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -115,6 +115,10 @@ impl ControllerUpcallClient { Ok(res) } + + pub(crate) fn base_url(&self) -> &Url { + &self.base_url + } } impl ControlPlaneGenerationsApi for ControllerUpcallClient { @@ -191,13 +195,15 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { let request = ReAttachRequest { node_id: self.node_id, - register, + register: register.clone(), }; let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( - "Received re-attach response with {} tenants", - response.tenants.len() + "Received re-attach response with {} tenants (node {}, register: {:?})", + response.tenants.len(), + self.node_id, + register, ); failpoint_support::sleep_millis_async!("control-plane-client-re-attach"); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index eb8191e43e..45481c4ed4 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -347,7 +347,7 @@ async fn init_load_generations( ); emergency_generations(tenant_confs) } else if let Some(client) = ControllerUpcallClient::new(conf, cancel) { - info!("Calling control plane API to re-attach tenants"); + info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { Ok(tenants) => tenants From 60241127e29cee199fc75fe5ddc15b6729d96aa7 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 3 Dec 2024 18:39:23 +0000 Subject: [PATCH 173/209] chore(proxy): remove postgres config parser and md5 support (#9990) Keeping the `mock` postgres cplane adaptor using "stock" tokio-postgres allows us to remove a lot of dead weight from our actual postgres connection logic. 
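With the connection-string parser gone, callers of the vendored `tokio-postgres2` build `Config` values programmatically. A sketch of what that looks like, using only setters that the removed parser itself dispatched to (`host`, `port`, `user`, `dbname`, `ssl_mode`, `channel_binding`); the module path, their public visibility, and the concrete values are assumptions, not the proxy's actual code.

```rust
use tokio_postgres2::config::{ChannelBinding, Config, SslMode};

/// Hypothetical helper: assembles a Config with explicit setters instead of
/// parsing a "postgres://..." URL (FromStr support is removed in this patch).
fn compute_connect_config(host: &str, user: &str, dbname: &str) -> Config {
    let mut config = Config::new();
    config.host(host);
    config.port(5432);
    config.user(user);
    config.dbname(dbname);
    config.ssl_mode(SslMode::Require);
    config.channel_binding(ChannelBinding::Require);
    config
}

fn main() {
    // Placeholder values; a real caller would take these from the control plane.
    let config = compute_connect_config("compute.example.internal", "cloud_admin", "postgres");
    println!("{config:?}"); // the Debug impl shown below omits the password
}
```

The mock control-plane path is unaffected: per the Cargo.lock change above it keeps using stock `tokio-postgres`, which still accepts connection strings.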
--- Cargo.lock | 2 +- libs/proxy/postgres-protocol2/Cargo.toml | 1 - .../src/authentication/mod.rs | 35 -- .../postgres-protocol2/src/message/backend.rs | 8 +- .../postgres-protocol2/src/password/mod.rs | 18 - .../postgres-protocol2/src/password/test.rs | 8 - libs/proxy/tokio-postgres2/src/config.rs | 465 +----------------- libs/proxy/tokio-postgres2/src/connect_raw.rs | 19 +- libs/proxy/tokio-postgres2/src/error/mod.rs | 6 - libs/proxy/tokio-postgres2/src/lib.rs | 20 - proxy/Cargo.toml | 6 +- proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/backend/console_redirect.rs | 2 +- proxy/src/auth/backend/mod.rs | 2 +- proxy/src/auth/flow.rs | 2 +- proxy/src/cancellation.rs | 6 +- proxy/src/compute.rs | 26 +- proxy/src/control_plane/client/mock.rs | 3 +- proxy/src/control_plane/client/neon.rs | 2 +- proxy/src/error.rs | 2 +- proxy/src/postgres_rustls/mod.rs | 6 +- proxy/src/proxy/retry.rs | 16 +- proxy/src/proxy/tests/mitm.rs | 22 +- proxy/src/proxy/tests/mod.rs | 20 +- proxy/src/serverless/backend.rs | 18 +- proxy/src/serverless/conn_pool.rs | 6 +- proxy/src/serverless/conn_pool_lib.rs | 4 +- proxy/src/serverless/json.rs | 6 +- proxy/src/serverless/local_conn_pool.rs | 10 +- proxy/src/serverless/sql_over_http.rs | 18 +- 30 files changed, 96 insertions(+), 665 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b2769e59f0..5b80ec5e93 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4209,7 +4209,6 @@ dependencies = [ "bytes", "fallible-iterator", "hmac", - "md-5", "memchr", "rand 0.8.5", "sha2", @@ -4612,6 +4611,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", + "tokio-postgres", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite", diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index 284a632954..f71c1599c7 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -10,7 +10,6 @@ byteorder.workspace = true bytes.workspace = true fallible-iterator.workspace = true hmac.workspace = true -md-5 = "0.10" memchr = "2.0" rand.workspace = true sha2.workspace = true diff --git a/libs/proxy/postgres-protocol2/src/authentication/mod.rs b/libs/proxy/postgres-protocol2/src/authentication/mod.rs index 71afa4b9b6..0bdc177143 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/mod.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/mod.rs @@ -1,37 +1,2 @@ //! Authentication protocol support. -use md5::{Digest, Md5}; - pub mod sasl; - -/// Hashes authentication information in a way suitable for use in response -/// to an `AuthenticationMd5Password` message. -/// -/// The resulting string should be sent back to the database in a -/// `PasswordMessage` message. 
-#[inline] -pub fn md5_hash(username: &[u8], password: &[u8], salt: [u8; 4]) -> String { - let mut md5 = Md5::new(); - md5.update(password); - md5.update(username); - let output = md5.finalize_reset(); - md5.update(format!("{:x}", output)); - md5.update(salt); - format!("md5{:x}", md5.finalize()) -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn md5() { - let username = b"md5_user"; - let password = b"password"; - let salt = [0x2a, 0x3d, 0x8f, 0xe0]; - - assert_eq!( - md5_hash(username, password, salt), - "md562af4dd09bbb41884907a838a3233294" - ); - } -} diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 33d77fc252..097964f9c1 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -79,7 +79,7 @@ pub enum Message { AuthenticationCleartextPassword, AuthenticationGss, AuthenticationKerberosV5, - AuthenticationMd5Password(AuthenticationMd5PasswordBody), + AuthenticationMd5Password, AuthenticationOk, AuthenticationScmCredential, AuthenticationSspi, @@ -191,11 +191,7 @@ impl Message { 0 => Message::AuthenticationOk, 2 => Message::AuthenticationKerberosV5, 3 => Message::AuthenticationCleartextPassword, - 5 => { - let mut salt = [0; 4]; - buf.read_exact(&mut salt)?; - Message::AuthenticationMd5Password(AuthenticationMd5PasswordBody { salt }) - } + 5 => Message::AuthenticationMd5Password, 6 => Message::AuthenticationScmCredential, 7 => Message::AuthenticationGss, 8 => Message::AuthenticationGssContinue, diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs index e669e80f3f..38eb31dfcf 100644 --- a/libs/proxy/postgres-protocol2/src/password/mod.rs +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -8,7 +8,6 @@ use crate::authentication::sasl; use hmac::{Hmac, Mac}; -use md5::Md5; use rand::RngCore; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; @@ -88,20 +87,3 @@ pub(crate) async fn scram_sha_256_salt( base64::encode(server_key) ) } - -/// **Not recommended, as MD5 is not considered to be secure.** -/// -/// Hash password using MD5 with the username as the salt. -/// -/// The client may assume the returned string doesn't contain any -/// special characters that would require escaping. 
-pub fn md5(password: &[u8], username: &str) -> String { - // salt password with username - let mut salted_password = Vec::from(password); - salted_password.extend_from_slice(username.as_bytes()); - - let mut hash = Md5::new(); - hash.update(&salted_password); - let digest = hash.finalize(); - format!("md5{:x}", digest) -} diff --git a/libs/proxy/postgres-protocol2/src/password/test.rs b/libs/proxy/postgres-protocol2/src/password/test.rs index c9d340f09d..0692c07adb 100644 --- a/libs/proxy/postgres-protocol2/src/password/test.rs +++ b/libs/proxy/postgres-protocol2/src/password/test.rs @@ -9,11 +9,3 @@ async fn test_encrypt_scram_sha_256() { "SCRAM-SHA-256$4096:AQIDBAUGBwgJCgsMDQ4PEA==$8rrDg00OqaiWXJ7p+sCgHEIaBSHY89ZJl3mfIsf32oY=:05L1f+yZbiN8O0AnO40Og85NNRhvzTS57naKRWCcsIA=" ); } - -#[test] -fn test_encrypt_md5() { - assert_eq!( - password::md5(b"secret", "foo"), - "md54ab2c5d00339c4b2a4e921d2dc4edec7" - ); -} diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 26124b38ef..5dad835c3b 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -6,11 +6,9 @@ use crate::connect_raw::RawConnection; use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; -use std::borrow::Cow; +use std::fmt; use std::str; -use std::str::FromStr; use std::time::Duration; -use std::{error, fmt, iter, mem}; use tokio::io::{AsyncRead, AsyncWrite}; pub use postgres_protocol2::authentication::sasl::ScramKeys; @@ -380,99 +378,6 @@ impl Config { self.max_backend_message_size } - fn param(&mut self, key: &str, value: &str) -> Result<(), Error> { - match key { - "user" => { - self.user(value); - } - "password" => { - self.password(value); - } - "dbname" => { - self.dbname(value); - } - "options" => { - self.options(value); - } - "application_name" => { - self.application_name(value); - } - "sslmode" => { - let mode = match value { - "disable" => SslMode::Disable, - "prefer" => SslMode::Prefer, - "require" => SslMode::Require, - _ => return Err(Error::config_parse(Box::new(InvalidValue("sslmode")))), - }; - self.ssl_mode(mode); - } - "host" => { - for host in value.split(',') { - self.host(host); - } - } - "port" => { - for port in value.split(',') { - let port = if port.is_empty() { - 5432 - } else { - port.parse() - .map_err(|_| Error::config_parse(Box::new(InvalidValue("port"))))? 
- }; - self.port(port); - } - } - "connect_timeout" => { - let timeout = value - .parse::() - .map_err(|_| Error::config_parse(Box::new(InvalidValue("connect_timeout"))))?; - if timeout > 0 { - self.connect_timeout(Duration::from_secs(timeout as u64)); - } - } - "target_session_attrs" => { - let target_session_attrs = match value { - "any" => TargetSessionAttrs::Any, - "read-write" => TargetSessionAttrs::ReadWrite, - _ => { - return Err(Error::config_parse(Box::new(InvalidValue( - "target_session_attrs", - )))); - } - }; - self.target_session_attrs(target_session_attrs); - } - "channel_binding" => { - let channel_binding = match value { - "disable" => ChannelBinding::Disable, - "prefer" => ChannelBinding::Prefer, - "require" => ChannelBinding::Require, - _ => { - return Err(Error::config_parse(Box::new(InvalidValue( - "channel_binding", - )))) - } - }; - self.channel_binding(channel_binding); - } - "max_backend_message_size" => { - let limit = value.parse::().map_err(|_| { - Error::config_parse(Box::new(InvalidValue("max_backend_message_size"))) - })?; - if limit > 0 { - self.max_backend_message_size(limit); - } - } - key => { - return Err(Error::config_parse(Box::new(UnknownOption( - key.to_string(), - )))); - } - } - - Ok(()) - } - /// Opens a connection to a PostgreSQL database. /// /// Requires the `runtime` Cargo feature (enabled by default). @@ -499,17 +404,6 @@ impl Config { } } -impl FromStr for Config { - type Err = Error; - - fn from_str(s: &str) -> Result { - match UrlParser::parse(s)? { - Some(config) => Ok(config), - None => Parser::parse(s), - } - } -} - // Omit password from debug output impl fmt::Debug for Config { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -536,360 +430,3 @@ impl fmt::Debug for Config { .finish() } } - -#[derive(Debug)] -struct UnknownOption(String); - -impl fmt::Display for UnknownOption { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(fmt, "unknown option `{}`", self.0) - } -} - -impl error::Error for UnknownOption {} - -#[derive(Debug)] -struct InvalidValue(&'static str); - -impl fmt::Display for InvalidValue { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(fmt, "invalid value for option `{}`", self.0) - } -} - -impl error::Error for InvalidValue {} - -struct Parser<'a> { - s: &'a str, - it: iter::Peekable>, -} - -impl<'a> Parser<'a> { - fn parse(s: &'a str) -> Result { - let mut parser = Parser { - s, - it: s.char_indices().peekable(), - }; - - let mut config = Config::new(); - - while let Some((key, value)) = parser.parameter()? 
{ - config.param(key, &value)?; - } - - Ok(config) - } - - fn skip_ws(&mut self) { - self.take_while(char::is_whitespace); - } - - fn take_while(&mut self, f: F) -> &'a str - where - F: Fn(char) -> bool, - { - let start = match self.it.peek() { - Some(&(i, _)) => i, - None => return "", - }; - - loop { - match self.it.peek() { - Some(&(_, c)) if f(c) => { - self.it.next(); - } - Some(&(i, _)) => return &self.s[start..i], - None => return &self.s[start..], - } - } - } - - fn eat(&mut self, target: char) -> Result<(), Error> { - match self.it.next() { - Some((_, c)) if c == target => Ok(()), - Some((i, c)) => { - let m = format!( - "unexpected character at byte {}: expected `{}` but got `{}`", - i, target, c - ); - Err(Error::config_parse(m.into())) - } - None => Err(Error::config_parse("unexpected EOF".into())), - } - } - - fn eat_if(&mut self, target: char) -> bool { - match self.it.peek() { - Some(&(_, c)) if c == target => { - self.it.next(); - true - } - _ => false, - } - } - - fn keyword(&mut self) -> Option<&'a str> { - let s = self.take_while(|c| match c { - c if c.is_whitespace() => false, - '=' => false, - _ => true, - }); - - if s.is_empty() { - None - } else { - Some(s) - } - } - - fn value(&mut self) -> Result { - let value = if self.eat_if('\'') { - let value = self.quoted_value()?; - self.eat('\'')?; - value - } else { - self.simple_value()? - }; - - Ok(value) - } - - fn simple_value(&mut self) -> Result { - let mut value = String::new(); - - while let Some(&(_, c)) = self.it.peek() { - if c.is_whitespace() { - break; - } - - self.it.next(); - if c == '\\' { - if let Some((_, c2)) = self.it.next() { - value.push(c2); - } - } else { - value.push(c); - } - } - - if value.is_empty() { - return Err(Error::config_parse("unexpected EOF".into())); - } - - Ok(value) - } - - fn quoted_value(&mut self) -> Result { - let mut value = String::new(); - - while let Some(&(_, c)) = self.it.peek() { - if c == '\'' { - return Ok(value); - } - - self.it.next(); - if c == '\\' { - if let Some((_, c2)) = self.it.next() { - value.push(c2); - } - } else { - value.push(c); - } - } - - Err(Error::config_parse( - "unterminated quoted connection parameter value".into(), - )) - } - - fn parameter(&mut self) -> Result, Error> { - self.skip_ws(); - let keyword = match self.keyword() { - Some(keyword) => keyword, - None => return Ok(None), - }; - self.skip_ws(); - self.eat('=')?; - self.skip_ws(); - let value = self.value()?; - - Ok(Some((keyword, value))) - } -} - -// This is a pretty sloppy "URL" parser, but it matches the behavior of libpq, where things really aren't very strict -struct UrlParser<'a> { - s: &'a str, - config: Config, -} - -impl<'a> UrlParser<'a> { - fn parse(s: &'a str) -> Result, Error> { - let s = match Self::remove_url_prefix(s) { - Some(s) => s, - None => return Ok(None), - }; - - let mut parser = UrlParser { - s, - config: Config::new(), - }; - - parser.parse_credentials()?; - parser.parse_host()?; - parser.parse_path()?; - parser.parse_params()?; - - Ok(Some(parser.config)) - } - - fn remove_url_prefix(s: &str) -> Option<&str> { - for prefix in &["postgres://", "postgresql://"] { - if let Some(stripped) = s.strip_prefix(prefix) { - return Some(stripped); - } - } - - None - } - - fn take_until(&mut self, end: &[char]) -> Option<&'a str> { - match self.s.find(end) { - Some(pos) => { - let (head, tail) = self.s.split_at(pos); - self.s = tail; - Some(head) - } - None => None, - } - } - - fn take_all(&mut self) -> &'a str { - mem::take(&mut self.s) - } - - fn eat_byte(&mut self) { - 
self.s = &self.s[1..]; - } - - fn parse_credentials(&mut self) -> Result<(), Error> { - let creds = match self.take_until(&['@']) { - Some(creds) => creds, - None => return Ok(()), - }; - self.eat_byte(); - - let mut it = creds.splitn(2, ':'); - let user = self.decode(it.next().unwrap())?; - self.config.user(&user); - - if let Some(password) = it.next() { - let password = Cow::from(percent_encoding::percent_decode(password.as_bytes())); - self.config.password(password); - } - - Ok(()) - } - - fn parse_host(&mut self) -> Result<(), Error> { - let host = match self.take_until(&['/', '?']) { - Some(host) => host, - None => self.take_all(), - }; - - if host.is_empty() { - return Ok(()); - } - - for chunk in host.split(',') { - let (host, port) = if chunk.starts_with('[') { - let idx = match chunk.find(']') { - Some(idx) => idx, - None => return Err(Error::config_parse(InvalidValue("host").into())), - }; - - let host = &chunk[1..idx]; - let remaining = &chunk[idx + 1..]; - let port = if let Some(port) = remaining.strip_prefix(':') { - Some(port) - } else if remaining.is_empty() { - None - } else { - return Err(Error::config_parse(InvalidValue("host").into())); - }; - - (host, port) - } else { - let mut it = chunk.splitn(2, ':'); - (it.next().unwrap(), it.next()) - }; - - self.host_param(host)?; - let port = self.decode(port.unwrap_or("5432"))?; - self.config.param("port", &port)?; - } - - Ok(()) - } - - fn parse_path(&mut self) -> Result<(), Error> { - if !self.s.starts_with('/') { - return Ok(()); - } - self.eat_byte(); - - let dbname = match self.take_until(&['?']) { - Some(dbname) => dbname, - None => self.take_all(), - }; - - if !dbname.is_empty() { - self.config.dbname(&self.decode(dbname)?); - } - - Ok(()) - } - - fn parse_params(&mut self) -> Result<(), Error> { - if !self.s.starts_with('?') { - return Ok(()); - } - self.eat_byte(); - - while !self.s.is_empty() { - let key = match self.take_until(&['=']) { - Some(key) => self.decode(key)?, - None => return Err(Error::config_parse("unterminated parameter".into())), - }; - self.eat_byte(); - - let value = match self.take_until(&['&']) { - Some(value) => { - self.eat_byte(); - value - } - None => self.take_all(), - }; - - if key == "host" { - self.host_param(value)?; - } else { - let value = self.decode(value)?; - self.config.param(&key, &value)?; - } - } - - Ok(()) - } - - fn host_param(&mut self, s: &str) -> Result<(), Error> { - let s = self.decode(s)?; - self.config.param("host", &s) - } - - fn decode(&self, s: &'a str) -> Result, Error> { - percent_encoding::percent_decode(s.as_bytes()) - .decode_utf8() - .map_err(|e| Error::config_parse(e.into())) - } -} diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 9c6f1a2552..390f133002 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -7,7 +7,6 @@ use crate::Error; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; -use postgres_protocol2::authentication; use postgres_protocol2::authentication::sasl; use postgres_protocol2::authentication::sasl::ScramSha256; use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; @@ -174,25 +173,11 @@ where authenticate_password(stream, pass).await?; } - Some(Message::AuthenticationMd5Password(body)) => { - can_skip_channel_binding(config)?; - - let user = config - .user - .as_ref() - .ok_or_else(|| Error::config("user 
missing".into()))?; - let pass = config - .password - .as_ref() - .ok_or_else(|| Error::config("password missing".into()))?; - - let output = authentication::md5_hash(user.as_bytes(), pass, body.salt()); - authenticate_password(stream, output.as_bytes()).await?; - } Some(Message::AuthenticationSasl(body)) => { authenticate_sasl(stream, body, config).await?; } - Some(Message::AuthenticationKerberosV5) + Some(Message::AuthenticationMd5Password) + | Some(Message::AuthenticationKerberosV5) | Some(Message::AuthenticationScmCredential) | Some(Message::AuthenticationGss) | Some(Message::AuthenticationSspi) => { diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index 6514322250..922c348525 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -349,7 +349,6 @@ enum Kind { Parse, Encode, Authentication, - ConfigParse, Config, Connect, Timeout, @@ -386,7 +385,6 @@ impl fmt::Display for Error { Kind::Parse => fmt.write_str("error parsing response from server")?, Kind::Encode => fmt.write_str("error encoding message to server")?, Kind::Authentication => fmt.write_str("authentication error")?, - Kind::ConfigParse => fmt.write_str("invalid connection string")?, Kind::Config => fmt.write_str("invalid configuration")?, Kind::Connect => fmt.write_str("error connecting to server")?, Kind::Timeout => fmt.write_str("timeout waiting for server")?, @@ -482,10 +480,6 @@ impl Error { Error::new(Kind::Authentication, Some(e)) } - pub(crate) fn config_parse(e: Box) -> Error { - Error::new(Kind::ConfigParse, Some(e)) - } - pub(crate) fn config(e: Box) -> Error { Error::new(Kind::Config, Some(e)) } diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 57c639a7de..901ed0c96c 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -13,14 +13,12 @@ pub use crate::query::RowStream; pub use crate::row::{Row, SimpleQueryRow}; pub use crate::simple_query::SimpleQueryStream; pub use crate::statement::{Column, Statement}; -use crate::tls::MakeTlsConnect; pub use crate::tls::NoTls; pub use crate::to_statement::ToStatement; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; use postgres_protocol2::message::backend::ReadyForQueryBody; -use tokio::net::TcpStream; /// After executing a query, the connection will be in one of these states #[derive(Clone, Copy, Debug, PartialEq)] @@ -72,24 +70,6 @@ mod transaction; mod transaction_builder; pub mod types; -/// A convenience function which parses a connection string and connects to the database. -/// -/// See the documentation for [`Config`] for details on the connection string format. -/// -/// Requires the `runtime` Cargo feature (enabled by default). -/// -/// [`Config`]: config/struct.Config.html -pub async fn connect( - config: &str, - tls: T, -) -> Result<(Client, Connection), Error> -where - T: MakeTlsConnect, -{ - let config = config.parse::()?; - config.connect(tls).await -} - /// An asynchronous notification. 
#[derive(Clone, Debug)] pub struct Notification { diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f5934c8a89..2f63ee3acc 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -6,7 +6,7 @@ license.workspace = true [features] default = [] -testing = [] +testing = ["dep:tokio-postgres"] [dependencies] ahash.workspace = true @@ -55,6 +55,7 @@ parquet.workspace = true parquet_derive.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true +postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true prometheus.workspace = true @@ -81,7 +82,7 @@ subtle.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } -tokio-postgres = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } +tokio-postgres = { workspace = true, optional = true } tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } @@ -119,3 +120,4 @@ rcgen.workspace = true rstest.workspace = true walkdir.workspace = true rand_distr = "0.4" +tokio-postgres.workspace = true diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 491b272ac4..5e494dfdd6 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -66,7 +66,7 @@ pub(super) async fn authenticate( Ok(ComputeCredentials { info: creds, - keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256( + keys: ComputeCredentialKeys::AuthKeys(postgres_client::config::AuthKeys::ScramSha256( scram_keys, )), }) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index bf7a1cb070..494564de05 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,8 +1,8 @@ use async_trait::async_trait; +use postgres_client::config::SslMode; use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7e1b26a11a..84a572dcf9 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -11,8 +11,8 @@ pub use console_redirect::ConsoleRedirectBackend; pub(crate) use console_redirect::ConsoleRedirectError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; +use postgres_client::config::AuthKeys; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::config::AuthKeys; use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 9c6ce151cb..60d1962d7f 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -227,7 +227,7 @@ pub(crate) async fn validate_password_and_exchange( }; Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys( - tokio_postgres::config::AuthKeys::ScramSha256(keys), + postgres_client::config::AuthKeys::ScramSha256(keys), ))) } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 91e198bf88..bcb0ef40bd 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,11 +3,11 @@ use std::sync::Arc; use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, Ipv6Net}; +use 
postgres_client::{CancelToken, NoTls}; use pq_proto::CancelKeyData; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; -use tokio_postgres::{CancelToken, NoTls}; use tracing::{debug, info}; use uuid::Uuid; @@ -44,7 +44,7 @@ pub(crate) enum CancelError { IO(#[from] std::io::Error), #[error("{0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("rate limit exceeded")] RateLimit, @@ -70,7 +70,7 @@ impl ReportableError for CancelError { impl CancellationHandler

{
    /// Run async action within an ephemeral session identified by [`CancelKeyData`].
    pub(crate) fn get_session(self: Arc<Self>) -> Session
{ - // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't + // HACK: We'd rather get the real backend_pid but postgres_client doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the // actual backend_pid, but backend_pid is not used for anything diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b689b97a21..06bc71c559 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -6,6 +6,8 @@ use std::time::Duration; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; +use postgres_client::tls::MakeTlsConnect; +use postgres_client::{CancelToken, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; @@ -13,8 +15,6 @@ use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; -use tokio_postgres::tls::MakeTlsConnect; -use tokio_postgres::{CancelToken, RawConnection}; use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; @@ -34,9 +34,9 @@ pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] pub(crate) enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, - /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such. + /// `postgres_client::error::Kind` doesn't contain ip addresses and such. #[error("{COULD_NOT_CONNECT}: {0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), @@ -99,13 +99,13 @@ impl ReportableError for ConnectionError { } /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. -pub(crate) type ScramKeys = tokio_postgres::config::ScramKeys<32>; +pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>; /// A config for establishing a connection to compute node. -/// Eventually, `tokio_postgres` will be replaced with something better. +/// Eventually, `postgres_client` will be replaced with something better. /// Newtype allows us to implement methods on top of it. #[derive(Clone, Default)] -pub(crate) struct ConnCfg(Box); +pub(crate) struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { @@ -126,7 +126,7 @@ impl ConnCfg { pub(crate) fn get_host(&self) -> Result { match self.0.get_hosts() { - [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), + [postgres_client::config::Host::Tcp(s)] => Ok(s.into()), // we should not have multiple address or unix addresses. _ => Err(WakeComputeError::BadComputeAddress( "invalid compute address".into(), @@ -160,7 +160,7 @@ impl ConnCfg { // TODO: This is especially ugly... if let Some(replication) = params.get("replication") { - use tokio_postgres::config::ReplicationMode; + use postgres_client::config::ReplicationMode; match replication { "true" | "on" | "yes" | "1" => { self.replication_mode(ReplicationMode::Physical); @@ -182,7 +182,7 @@ impl ConnCfg { } impl std::ops::Deref for ConnCfg { - type Target = tokio_postgres::Config; + type Target = postgres_client::Config; fn deref(&self) -> &Self::Target { &self.0 @@ -199,7 +199,7 @@ impl std::ops::DerefMut for ConnCfg { impl ConnCfg { /// Establish a raw TCP connection to the compute node. 
async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { - use tokio_postgres::config::Host; + use postgres_client::config::Host; // wrap TcpStream::connect with timeout let connect_with_timeout = |host, port| { @@ -224,7 +224,7 @@ impl ConnCfg { }) }; - // We can't reuse connection establishing logic from `tokio_postgres` here, + // We can't reuse connection establishing logic from `postgres_client` here, // because it has no means for extracting the underlying socket which we // require for our business. let mut connection_error = None; @@ -272,7 +272,7 @@ type RustlsStream = > pub(crate) struct PostgresConnection { /// Socket connected to a compute node. pub(crate) stream: - tokio_postgres::maybe_tls_stream::MaybeTlsStream, + postgres_client::maybe_tls_stream::MaybeTlsStream, /// PostgreSQL connection parameters. pub(crate) params: std::collections::HashMap, /// Query cancellation token. diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 9537d717a1..4d55f96ca1 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -5,7 +5,6 @@ use std::sync::Arc; use futures::TryFutureExt; use thiserror::Error; -use tokio_postgres::config::SslMode; use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; @@ -165,7 +164,7 @@ impl MockControlPlane { config .host(self.endpoint.host_str().unwrap_or("localhost")) .port(self.endpoint.port().unwrap_or(5432)) - .ssl_mode(SslMode::Disable); + .ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 2cad981d01..5a78ec9d32 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -6,8 +6,8 @@ use std::time::Duration; use ::http::header::AUTHORIZATION; use ::http::HeaderName; use futures::TryFutureExt; +use postgres_client::config::SslMode; use tokio::time::Instant; -use tokio_postgres::config::SslMode; use tracing::{debug, info, info_span, warn, Instrument}; use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute}; diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 2221aac407..6a379499dc 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -84,7 +84,7 @@ pub(crate) trait ReportableError: fmt::Display + Send + 'static { fn get_error_kind(&self) -> ErrorKind; } -impl ReportableError for tokio_postgres::error::Error { +impl ReportableError for postgres_client::error::Error { fn get_error_kind(&self) -> ErrorKind { if self.as_db_error().is_some() { ErrorKind::Postgres diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/postgres_rustls/mod.rs index 31e7915e89..5ef20991c3 100644 --- a/proxy/src/postgres_rustls/mod.rs +++ b/proxy/src/postgres_rustls/mod.rs @@ -1,10 +1,10 @@ use std::convert::TryFrom; use std::sync::Arc; +use postgres_client::tls::MakeTlsConnect; use rustls::pki_types::ServerName; use rustls::ClientConfig; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::tls::MakeTlsConnect; mod private { use std::future::Future; @@ -12,9 +12,9 @@ mod private { use std::pin::Pin; use std::task::{Context, Poll}; + use postgres_client::tls::{ChannelBinding, TlsConnect}; use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; - use tokio_postgres::tls::{ChannelBinding, TlsConnect}; use tokio_rustls::client::TlsStream; use tokio_rustls::TlsConnector; @@ -59,7 +59,7 @@ 
mod private { pub struct RustlsStream(TlsStream); - impl tokio_postgres::tls::TlsStream for RustlsStream + impl postgres_client::tls::TlsStream for RustlsStream where S: AsyncRead + AsyncWrite + Unpin, { diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index d3f0c3e7d4..42d1491782 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -31,9 +31,9 @@ impl CouldRetry for io::Error { } } -impl CouldRetry for tokio_postgres::error::DbError { +impl CouldRetry for postgres_client::error::DbError { fn could_retry(&self) -> bool { - use tokio_postgres::error::SqlState; + use postgres_client::error::SqlState; matches!( self.code(), &SqlState::CONNECTION_FAILURE @@ -43,9 +43,9 @@ impl CouldRetry for tokio_postgres::error::DbError { ) } } -impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { +impl ShouldRetryWakeCompute for postgres_client::error::DbError { fn should_retry_wake_compute(&self) -> bool { - use tokio_postgres::error::SqlState; + use postgres_client::error::SqlState; // Here are errors that happens after the user successfully authenticated to the database. // TODO: there are pgbouncer errors that should be retried, but they are not listed here. !matches!( @@ -61,21 +61,21 @@ impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { } } -impl CouldRetry for tokio_postgres::Error { +impl CouldRetry for postgres_client::Error { fn could_retry(&self) -> bool { if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { io::Error::could_retry(io_err) } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::could_retry(db_err) + postgres_client::error::DbError::could_retry(db_err) } else { false } } } -impl ShouldRetryWakeCompute for tokio_postgres::Error { +impl ShouldRetryWakeCompute for postgres_client::Error { fn should_retry_wake_compute(&self) -> bool { if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::should_retry_wake_compute(db_err) + postgres_client::error::DbError::should_retry_wake_compute(db_err) } else { // likely an IO error. Possible the compute has shutdown and the // cache is stale. 
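For context on how a classification like `could_retry()` above is typically consumed, a minimal sketch of a retry loop follows; it assumes the crate's `CouldRetry` trait with the `fn could_retry(&self) -> bool` shape shown in the hunk above, while the loop itself, the backoff, and the `max_retries` cap are hypothetical and not code from this patch:

```rust
use std::time::Duration;

// Sketch: drive a reconnect loop off a could_retry()-style classification.
// Everything except the could_retry() call is illustrative.
async fn with_retries<T, E, F, Fut>(mut op: F, max_retries: u32) -> Result<T, E>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = Result<T, E>>,
    E: CouldRetry,
{
    let mut attempt = 0;
    loop {
        match op().await {
            Ok(v) => return Ok(v),
            Err(e) if attempt < max_retries && e.could_retry() => {
                attempt += 1;
                // Back off a little longer after each failed attempt.
                tokio::time::sleep(Duration::from_millis(100 << attempt)).await;
            }
            Err(e) => return Err(e),
        }
    }
}
```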
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index fe211adfeb..ef351f3b54 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -8,9 +8,9 @@ use std::fmt::Debug; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; +use postgres_client::tls::TlsConnect; use postgres_protocol::message::frontend; use tokio::io::{AsyncReadExt, DuplexStream}; -use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; use super::*; @@ -158,8 +158,8 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _client_err = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Disable) + let _client_err = postgres_client::Config::new() + .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") .password("password") @@ -175,7 +175,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> { connect_failure( Intercept::None, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -185,7 +185,7 @@ async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> { connect_failure( Intercept::Methods, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -195,7 +195,7 @@ async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Result<()> { connect_failure( Intercept::SASLResponse, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -205,7 +205,7 @@ async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Resul async fn scram_auth_require_channel_binding() -> anyhow::Result<()> { connect_failure( Intercept::None, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } @@ -215,7 +215,7 @@ async fn scram_auth_require_channel_binding() -> anyhow::Result<()> { async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> { connect_failure( Intercept::Methods, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } @@ -225,14 +225,14 @@ async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> { async fn scram_auth_require_channel_binding_intercept_response() -> anyhow::Result<()> { connect_failure( Intercept::SASLResponse, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } async fn connect_failure( intercept: Intercept, - channel_binding: tokio_postgres::config::ChannelBinding, + channel_binding: postgres_client::config::ChannelBinding, ) -> anyhow::Result<()> { let (server, client, client_config, server_config) = proxy_mitm(intercept).await; let proxy = tokio::spawn(dummy_proxy( @@ -241,7 +241,7 @@ async fn connect_failure( Scram::new("password").await?, )); - let _client_err = tokio_postgres::Config::new() + let _client_err = postgres_client::Config::new() .channel_binding(channel_binding) .user("user") .dbname("db") diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 15be6c9724..c8b742b3ff 100644 --- 
a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -7,13 +7,13 @@ use std::time::Duration; use anyhow::{bail, Context}; use async_trait::async_trait; use http::StatusCode; +use postgres_client::config::SslMode; +use postgres_client::tls::{MakeTlsConnect, NoTls}; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; use tokio::io::DuplexStream; -use tokio_postgres::config::SslMode; -use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; @@ -204,7 +204,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (_, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let client_err = tokio_postgres::Config::new() + let client_err = postgres_client::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let _conn = tokio_postgres::Config::new() + let _conn = postgres_client::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) @@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - let _conn = tokio_postgres::Config::new() + let _conn = postgres_client::Config::new() .user("john_doe") .dbname("earth") .options("project=generic-project-name") @@ -296,8 +296,8 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { Scram::new(password).await?, )); - let _conn = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Require) + let _conn = postgres_client::Config::new() + .channel_binding(postgres_client::config::ChannelBinding::Require) .user("user") .dbname("db") .password(password) @@ -320,8 +320,8 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _conn = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Disable) + let _conn = postgres_client::Config::new() + .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") .password("password") @@ -348,7 +348,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { .map(char::from) .collect(); - let _client_err = tokio_postgres::Config::new() + let _client_err = postgres_client::Config::new() .user("user") .dbname("db") .password(&password) // no password will match the mocked secret diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 57846a4c2c..8c7931907d 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -37,9 +37,9 @@ use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX}; pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc>>, - pub(crate) local_pool: Arc>, + pub(crate) local_pool: Arc>, pub(crate) pool: - Arc>>, + Arc>>, pub(crate) config: &'static ProxyConfig, pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, @@ -170,7 +170,7 @@ impl PoolingBackend { conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, - ) -> Result, HttpConnError> { + ) -> Result, HttpConnError> { let maybe_client = if force_new { debug!("pool: pool is disabled"); 
None @@ -256,7 +256,7 @@ impl PoolingBackend { &self, ctx: &RequestContext, conn_info: ConnInfo, - ) -> Result, HttpConnError> { + ) -> Result, HttpConnError> { if let Some(client) = self.local_pool.get(ctx, &conn_info)? { return Ok(client); } @@ -315,7 +315,7 @@ impl PoolingBackend { )); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + let (client, connection) = config.connect(postgres_client::NoTls).await?; drop(pause); let pid = client.get_process_id(); @@ -360,7 +360,7 @@ pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), #[error("could not connection to postgres in compute")] - PostgresConnectionError(#[from] tokio_postgres::Error), + PostgresConnectionError(#[from] postgres_client::Error), #[error("could not connection to local-proxy in compute")] LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not parse JWT payload")] @@ -479,7 +479,7 @@ impl ShouldRetryWakeCompute for LocalProxyConnError { } struct TokioMechanism { - pool: Arc>>, + pool: Arc>>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -489,7 +489,7 @@ struct TokioMechanism { #[async_trait] impl ConnectMechanism for TokioMechanism { - type Connection = Client; + type Connection = Client; type ConnectError = HttpConnError; type Error = HttpConnError; @@ -509,7 +509,7 @@ impl ConnectMechanism for TokioMechanism { .connect_timeout(timeout); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let res = config.connect(tokio_postgres::NoTls).await; + let res = config.connect(postgres_client::NoTls).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c302eac568..cac5a173cb 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -5,11 +5,11 @@ use std::task::{ready, Poll}; use futures::future::poll_fn; use futures::Future; +use postgres_client::tls::NoTlsStream; +use postgres_client::AsyncMessage; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; -use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::AsyncMessage; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; #[cfg(test)] @@ -58,7 +58,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: postgres_client::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index fe1d2563bc..2a46c8f9c5 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -7,8 +7,8 @@ use std::time::Duration; use dashmap::DashMap; use parking_lot::RwLock; +use postgres_client::ReadyForQueryStatus; use rand::Rng; -use tokio_postgres::ReadyForQueryStatus; use tracing::{debug, info, Span}; use super::backend::HttpConnError; @@ -683,7 +683,7 @@ pub(crate) trait ClientInnerExt: Sync + Send + 'static { fn get_process_id(&self) -> i32; } -impl ClientInnerExt for tokio_postgres::Client { +impl ClientInnerExt for postgres_client::Client { fn is_closed(&self) -> bool { self.is_closed() } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 569e2da571..25b25c66d3 100644 --- 
a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,6 +1,6 @@ +use postgres_client::types::{Kind, Type}; +use postgres_client::Row; use serde_json::{Map, Value}; -use tokio_postgres::types::{Kind, Type}; -use tokio_postgres::Row; // // Convert json non-string types to strings, so that they can be passed to Postgres @@ -61,7 +61,7 @@ fn json_array_to_pg_array(value: &Value) -> Option { #[derive(Debug, thiserror::Error)] pub(crate) enum JsonConversionError { #[error("internal error compute returned invalid data: {0}")] - AsTextError(tokio_postgres::Error), + AsTextError(postgres_client::Error), #[error("parse int error: {0}")] ParseIntError(#[from] std::num::ParseIntError), #[error("parse float error: {0}")] diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index db9ac49dae..b84cde9e25 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -22,13 +22,13 @@ use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; +use postgres_client::tls::NoTlsStream; +use postgres_client::types::ToSql; +use postgres_client::AsyncMessage; use serde_json::value::RawValue; use signature::Signer; use tokio::net::TcpStream; use tokio::time::Instant; -use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::types::ToSql; -use tokio_postgres::AsyncMessage; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, info_span, warn, Instrument}; @@ -164,7 +164,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: postgres_client::Connection, key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, @@ -280,7 +280,7 @@ pub(crate) fn poll_client( ) } -impl ClientInnerCommon { +impl ClientInnerCommon { pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { if let ClientDataEnum::Local(local_data) = &mut self.data { local_data.jti += 1; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index a0ca7cc60d..5e85f5ec40 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -11,12 +11,12 @@ use http_body_util::{BodyExt, Full}; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use postgres_client::error::{DbError, ErrorPosition, SqlState}; +use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; use tokio::time::{self, Instant}; -use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; -use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; use typed_json::json; @@ -361,7 +361,7 @@ pub(crate) enum SqlOverHttpError { #[error("invalid isolation level")] InvalidIsolationLevel, #[error("{0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("{0}")] JsonConversion(#[from] JsonConversionError), #[error("{0}")] @@ -986,7 +986,7 @@ async fn query_to_json( // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. 
- let mut rows: Vec = Vec::new(); + let mut rows: Vec = Vec::new(); while let Some(row) = row_stream.next().await { let row = row?; *current_size += row.body_len(); @@ -1063,13 +1063,13 @@ async fn query_to_json( } enum Client { - Remote(conn_pool_lib::Client), - Local(conn_pool_lib::Client), + Remote(conn_pool_lib::Client), + Local(conn_pool_lib::Client), } enum Discard<'a> { - Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>), - Local(conn_pool_lib::Discard<'a, tokio_postgres::Client>), + Remote(conn_pool_lib::Discard<'a, postgres_client::Client>), + Local(conn_pool_lib::Discard<'a, postgres_client::Client>), } impl Client { @@ -1080,7 +1080,7 @@ impl Client { } } - fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { + fn inner(&mut self) -> (&mut postgres_client::Client, Discard<'_>) { match self { Client::Remote(client) => { let (c, d) = client.inner(); From 4157eaf4c56ed066f31512a82ea0a54689ba6648 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 19:47:17 +0100 Subject: [PATCH 174/209] pageserver: respond to multiple shutdown signals (#9982) ## Problem The Pageserver signal handler would only respond to a single signal and initiate shutdown. Subsequent signals were ignored. This meant that a `SIGQUIT` sent after a `SIGTERM` had no effect (e.g. in the case of a slow or stalled shutdown). The `test_runner` uses this to force shutdown if graceful shutdown is slow. Touches #9740. ## Summary of changes Keep responding to signals after the initial shutdown signal has been received. Arguably, the `test_runner` should also use `SIGKILL` rather than `SIGQUIT` in this case, but it seems reasonable to respond to `SIGQUIT` regardless. --- pageserver/src/bin/pageserver.rs | 76 +++++++++++++++++++------------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 8fe225c6aa..567a69da3b 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -636,45 +636,59 @@ fn start_pageserver( tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? }); - let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); - // All started up! Now just sit and wait for shutdown signal. + BACKGROUND_RUNTIME.block_on(async move { + let signal_token = CancellationToken::new(); + let signal_cancel = signal_token.child_token(); - { - BACKGROUND_RUNTIME.block_on(async move { + // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals + // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See: + // https://github.com/neondatabase/neon/issues/9740. + tokio::spawn(async move { let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); - let signal = tokio::select! { - _ = sigquit.recv() => { - info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); - std::process::exit(111); + + loop { + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode."); + std::process::exit(111); + } + _ = sigint.recv() => "SIGINT", + _ = sigterm.recv() => "SIGTERM", + }; + + if !signal_token.is_cancelled() { + info!("Got signal {signal}. 
Terminating gracefully in fast shutdown mode."); + signal_token.cancel(); + } else { + info!("Got signal {signal}. Already shutting down."); } - _ = sigint.recv() => { "SIGINT" }, - _ = sigterm.recv() => { "SIGTERM" }, - }; + } + }); - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); + // Wait for cancellation signal and shut down the pageserver. + // + // This cancels the `shutdown_pageserver` cancellation tree. Right now that tree doesn't + // reach very far, and `task_mgr` is used instead. The plan is to change that over time. + signal_cancel.cancelled().await; - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. - shutdown_pageserver.take(); - pageserver::shutdown_pageserver( - http_endpoint_listener, - page_service, - consumption_metrics_tasks, - disk_usage_eviction_task, - &tenant_manager, - background_purges, - deletion_queue.clone(), - secondary_controller_tasks, - 0, - ) - .await; - unreachable!() - }) - } + shutdown_pageserver.cancel(); + pageserver::shutdown_pageserver( + http_endpoint_listener, + page_service, + consumption_metrics_tasks, + disk_usage_eviction_task, + &tenant_manager, + background_purges, + deletion_queue.clone(), + secondary_controller_tasks, + 0, + ) + .await; + unreachable!(); + }) } async fn create_remote_storage_client( From 5519e4261206ec5ba7caf243e176b948e2605ca5 Mon Sep 17 00:00:00 2001 From: Alexey Immoreev Date: Tue, 3 Dec 2024 22:59:44 +0400 Subject: [PATCH 175/209] Improvement: add console redirect timeout warning (#9985) ## Problem There is no information on session being cancelled in 2 minutes at the moment ## Summary of changes The timeout being logged for the user --- proxy/src/auth/backend/console_redirect.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 494564de05..619c7b4ef1 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -49,13 +49,19 @@ impl ReportableError for ConsoleRedirectError { } } -fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { +fn hello_message( + redirect_uri: &reqwest::Url, + session_id: &str, + duration: std::time::Duration, +) -> String { + let formatted_duration = humantime::format_duration(duration).to_string(); format!( concat![ "Welcome to Neon!\n", - "Authenticate by visiting:\n", + "Authenticate by visiting (will expire in {duration}):\n", " {redirect_uri}{session_id}\n\n", ], + duration = formatted_duration, redirect_uri = redirect_uri, session_id = session_id, ) @@ -118,7 +124,11 @@ async fn authenticate( }; let span = info_span!("console_redirect", psql_session_id = &psql_session_id); - let greeting = hello_message(link_uri, &psql_session_id); + let greeting = hello_message( + link_uri, + &psql_session_id, + auth_config.console_redirect_confirmation_timeout, + ); // Give user a URL to spawn a new database. info!(parent: &span, "sending the auth URL to the user"); From fbc8c36983122801ee9f12772c3062dda22ee915 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 3 Dec 2024 20:00:14 +0000 Subject: [PATCH 176/209] chore(proxy): enforce single host+port (#9995) proxy doesn't ever provide multiple hosts/ports, so this code adds a lot of complexity of error handling for no good reason. 
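For illustration, a minimal sketch of what constructing a compute connection config looks like once host and port are mandatory constructor arguments; the method names are taken from the diff below, while the concrete host, user, and database values are made up:

```rust
fn example_config() -> postgres_client::Config {
    // Host and port are fixed at construction time, so the connect path no
    // longer has to reconcile host/port lists of different lengths.
    let mut config = postgres_client::Config::new("compute.example.internal".to_owned(), 5432);
    config
        .user("cloud_admin")
        .dbname("postgres")
        .ssl_mode(postgres_client::config::SslMode::Disable);
    config
}
```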
(stacked on #9990) --- libs/proxy/tokio-postgres2/src/config.rs | 41 ++++----------- libs/proxy/tokio-postgres2/src/connect.rs | 42 ++++----------- proxy/src/auth/backend/console_redirect.rs | 8 +-- proxy/src/auth/backend/local.rs | 7 +-- proxy/src/compute.rs | 59 ++++++---------------- proxy/src/control_plane/client/mock.rs | 10 ++-- proxy/src/control_plane/client/neon.rs | 4 +- proxy/src/proxy/connect_compute.rs | 2 +- proxy/src/proxy/tests/mitm.rs | 4 +- proxy/src/proxy/tests/mod.rs | 14 ++--- proxy/src/serverless/backend.rs | 10 ++-- 11 files changed, 59 insertions(+), 142 deletions(-) diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 5dad835c3b..fd10ef6f20 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -146,6 +146,9 @@ pub enum AuthKeys { /// ``` #[derive(Clone, PartialEq, Eq)] pub struct Config { + pub(crate) host: Host, + pub(crate) port: u16, + pub(crate) user: Option, pub(crate) password: Option>, pub(crate) auth_keys: Option>, @@ -153,8 +156,6 @@ pub struct Config { pub(crate) options: Option, pub(crate) application_name: Option, pub(crate) ssl_mode: SslMode, - pub(crate) host: Vec, - pub(crate) port: Vec, pub(crate) connect_timeout: Option, pub(crate) target_session_attrs: TargetSessionAttrs, pub(crate) channel_binding: ChannelBinding, @@ -162,16 +163,12 @@ pub struct Config { pub(crate) max_backend_message_size: Option, } -impl Default for Config { - fn default() -> Config { - Config::new() - } -} - impl Config { /// Creates a new configuration. - pub fn new() -> Config { + pub fn new(host: String, port: u16) -> Config { Config { + host: Host::Tcp(host), + port, user: None, password: None, auth_keys: None, @@ -179,8 +176,6 @@ impl Config { options: None, application_name: None, ssl_mode: SslMode::Prefer, - host: vec![], - port: vec![], connect_timeout: None, target_session_attrs: TargetSessionAttrs::Any, channel_binding: ChannelBinding::Prefer, @@ -283,32 +278,14 @@ impl Config { self.ssl_mode } - /// Adds a host to the configuration. - /// - /// Multiple hosts can be specified by calling this method multiple times, and each will be tried in order. - pub fn host(&mut self, host: &str) -> &mut Config { - self.host.push(Host::Tcp(host.to_string())); - self - } - /// Gets the hosts that have been added to the configuration with `host`. - pub fn get_hosts(&self) -> &[Host] { + pub fn get_host(&self) -> &Host { &self.host } - /// Adds a port to the configuration. - /// - /// Multiple ports can be specified by calling this method multiple times. There must either be no ports, in which - /// case the default of 5432 is used, a single port, in which it is used for all hosts, or the same number of ports - /// as hosts. - pub fn port(&mut self, port: u16) -> &mut Config { - self.port.push(port); - self - } - /// Gets the ports that have been added to the configuration with `port`. - pub fn get_ports(&self) -> &[u16] { - &self.port + pub fn get_port(&self) -> u16 { + self.port } /// Sets the timeout applied to socket-level connection attempts. 
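To show how the now-single host is consumed, a short sketch mirroring the `get_host()`/`Host::Tcp` usage visible in the connect.rs and compute.rs hunks that follow; the free-function wrapper is illustrative only:

```rust
// Sketch: with exactly one host/port pair stored in the config, extracting the
// hostname is a simple match instead of a loop over candidate hosts.
fn hostname(config: &postgres_client::Config) -> &str {
    match config.get_host() {
        postgres_client::config::Host::Tcp(host) => host.as_str(),
    }
}
```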
diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 98067d91f9..75a58e6eac 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -19,38 +19,18 @@ pub async fn connect( where T: MakeTlsConnect, { - if config.host.is_empty() { - return Err(Error::config("host missing".into())); + let hostname = match &config.host { + Host::Tcp(host) => host.as_str(), + }; + + let tls = tls + .make_tls_connect(hostname) + .map_err(|e| Error::tls(e.into()))?; + + match connect_once(&config.host, config.port, tls, config).await { + Ok((client, connection)) => Ok((client, connection)), + Err(e) => Err(e), } - - if config.port.len() > 1 && config.port.len() != config.host.len() { - return Err(Error::config("invalid number of ports".into())); - } - - let mut error = None; - for (i, host) in config.host.iter().enumerate() { - let port = config - .port - .get(i) - .or_else(|| config.port.first()) - .copied() - .unwrap_or(5432); - - let hostname = match host { - Host::Tcp(host) => host.as_str(), - }; - - let tls = tls - .make_tls_connect(hostname) - .map_err(|e| Error::tls(e.into()))?; - - match connect_once(host, port, tls, config).await { - Ok((client, connection)) => return Ok((client, connection)), - Err(e) => error = Some(e), - } - } - - Err(error.unwrap()) } async fn connect_once( diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 619c7b4ef1..575d60be85 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -161,12 +161,8 @@ async fn authenticate( // This config should be self-contained, because we won't // take username or dbname from client's startup message. - let mut config = compute::ConnCfg::new(); - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); + let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port); + config.dbname(&db_info.dbname).user(&db_info.user); ctx.set_dbname(db_info.dbname.into()); ctx.set_user(db_info.user.into()); diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 32e0f53615..d4273fb521 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -29,12 +29,7 @@ impl LocalBackend { api: http::Endpoint::new(compute_ctl, http::new_client()), }, node_info: NodeInfo { - config: { - let mut cfg = ConnCfg::new(); - cfg.host(&postgres_addr.ip().to_string()); - cfg.port(postgres_addr.port()); - cfg - }, + config: ConnCfg::new(postgres_addr.ip().to_string(), postgres_addr.port()), // TODO(conrad): make this better reflect compute info rather than endpoint info. aux: MetricsAuxInfo { endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 06bc71c559..ab0ff4b795 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -104,13 +104,13 @@ pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>; /// A config for establishing a connection to compute node. /// Eventually, `postgres_client` will be replaced with something better. /// Newtype allows us to implement methods on top of it. -#[derive(Clone, Default)] +#[derive(Clone)] pub(crate) struct ConnCfg(Box); /// Creation and initialization routines. 
impl ConnCfg { - pub(crate) fn new() -> Self { - Self::default() + pub(crate) fn new(host: String, port: u16) -> Self { + Self(Box::new(postgres_client::Config::new(host, port))) } /// Reuse password or auth keys from the other config. @@ -124,13 +124,9 @@ impl ConnCfg { } } - pub(crate) fn get_host(&self) -> Result { - match self.0.get_hosts() { - [postgres_client::config::Host::Tcp(s)] => Ok(s.into()), - // we should not have multiple address or unix addresses. - _ => Err(WakeComputeError::BadComputeAddress( - "invalid compute address".into(), - )), + pub(crate) fn get_host(&self) -> Host { + match self.0.get_host() { + postgres_client::config::Host::Tcp(s) => s.into(), } } @@ -227,43 +223,20 @@ impl ConnCfg { // We can't reuse connection establishing logic from `postgres_client` here, // because it has no means for extracting the underlying socket which we // require for our business. - let mut connection_error = None; - let ports = self.0.get_ports(); - let hosts = self.0.get_hosts(); - // the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array - if ports.len() > 1 && ports.len() != hosts.len() { - return Err(io::Error::new( - io::ErrorKind::Other, - format!( - "bad compute config, \ - ports and hosts entries' count does not match: {:?}", - self.0 - ), - )); - } + let port = self.0.get_port(); + let host = self.0.get_host(); - for (i, host) in hosts.iter().enumerate() { - let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432); - let host = match host { - Host::Tcp(host) => host.as_str(), - }; + let host = match host { + Host::Tcp(host) => host.as_str(), + }; - match connect_once(host, *port).await { - Ok((sockaddr, stream)) => return Ok((sockaddr, stream, host)), - Err(err) => { - // We can't throw an error here, as there might be more hosts to try. - warn!("couldn't connect to compute node at {host}:{port}: {err}"); - connection_error = Some(err); - } + match connect_once(host, port).await { + Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)), + Err(err) => { + warn!("couldn't connect to compute node at {host}:{port}: {err}"); + Err(err) } } - - Err(connection_error.unwrap_or_else(|| { - io::Error::new( - io::ErrorKind::Other, - format!("bad compute config: {:?}", self.0), - ) - })) } } diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 4d55f96ca1..eaf692ab27 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -160,11 +160,11 @@ impl MockControlPlane { } async fn do_wake_compute(&self) -> Result { - let mut config = compute::ConnCfg::new(); - config - .host(self.endpoint.host_str().unwrap_or("localhost")) - .port(self.endpoint.port().unwrap_or(5432)) - .ssl_mode(postgres_client::config::SslMode::Disable); + let mut config = compute::ConnCfg::new( + self.endpoint.host_str().unwrap_or("localhost").to_owned(), + self.endpoint.port().unwrap_or(5432), + ); + config.ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 5a78ec9d32..5c204ae1d7 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -241,8 +241,8 @@ impl NeonControlPlaneClient { // Don't set anything but host and port! This config will be cached. // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). 
- let mut config = compute::ConnCfg::new(); - config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + let mut config = compute::ConnCfg::new(host.to_owned(), port); + config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. let node = NodeInfo { config, diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 2e759b0894..585dce7bae 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -86,7 +86,7 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &control_plane::CachedNodeInfo, timeout: time::Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; permit.release_result(node_info.connect(ctx, timeout).await) } diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index ef351f3b54..d72331c7bf 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -158,7 +158,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _client_err = postgres_client::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") @@ -241,7 +241,7 @@ async fn connect_failure( Scram::new("password").await?, )); - let _client_err = postgres_client::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(channel_binding) .user("user") .dbname("db") diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index c8b742b3ff..53345431e3 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -204,7 +204,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (_, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let client_err = postgres_client::Config::new() + let client_err = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let _conn = postgres_client::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) @@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - let _conn = postgres_client::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .options("project=generic-project-name") @@ -296,7 +296,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { Scram::new(password).await?, )); - let _conn = postgres_client::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Require) .user("user") .dbname("db") @@ -320,7 +320,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _conn = postgres_client::Config::new() + let _conn = 
postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") @@ -348,7 +348,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { .map(char::from) .collect(); - let _client_err = postgres_client::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .user("user") .dbname("db") .password(&password) // no password will match the mocked secret @@ -546,7 +546,7 @@ impl TestControlPlaneClient for TestConnectMechanism { fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { - config: compute::ConnCfg::new(), + config: compute::ConnCfg::new("test".to_owned(), 5432), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 8c7931907d..55d2e47fd3 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -499,7 +499,7 @@ impl ConnectMechanism for TokioMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let mut config = (*node_info.config).clone(); @@ -549,16 +549,12 @@ impl ConnectMechanism for HyperMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let port = *node_info.config.get_ports().first().ok_or_else(|| { - HttpConnError::WakeCompute(WakeComputeError::BadComputeAddress( - "local-proxy port missing on compute address".into(), - )) - })?; + let port = node_info.config.get_port(); let res = connect_http2(&host, port, timeout).await; drop(pause); let (client, connection) = permit.release_result(res)?; From 718645e56cc2a1527c28620fe02701267cd96b9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 3 Dec 2024 21:39:10 +0100 Subject: [PATCH 177/209] Support tenant manifests in the scrubber (#9942) Support tenant manifests in the storage scrubber: * list the manifests, order them by generation * delete all manifests except for the two most recent generations * for the latest manifest: try parsing it. I've tested this patch by running the against a staging bucket and it successfully deleted stuff (and avoided deleting the latest two generations). In follow-up work, we might want to also check some invariants of the manifest, as mentioned in #8088. 
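As a rough illustration of the retention rule described above (keep the two most recent generations, parse only the latest), here is a minimal, self-contained sketch; the integer generations and the `split_manifests` helper are made up for the example and are not the scrubber's actual types or code:

```rust
// Sketch: given (generation, manifest) pairs, pick the newest manifest to parse
// and treat everything older than the two newest entries as deletable.
fn split_manifests<T>(mut manifests: Vec<(u32, T)>) -> (Option<(u32, T)>, Vec<(u32, T)>) {
    // Sort ascending by generation so the last element is the latest manifest.
    manifests.sort_by_key(|m| m.0);
    let latest = manifests.pop();
    // Keep one more generation as a safety margin; the rest are deletion candidates.
    let _second_latest = manifests.pop();
    (latest, manifests)
}

fn main() {
    let (latest, deletable) = split_manifests(vec![(3, "m3"), (1, "m1"), (2, "m2")]);
    assert_eq!(latest, Some((3, "m3")));
    assert_eq!(deletable, vec![(1, "m1")]);
}
```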
Part of #9386 Part of #8088 --------- Co-authored-by: Christian Schwarz --- .../src/tenant/remote_timeline_client.rs | 4 +- .../tenant/remote_timeline_client/manifest.rs | 2 +- storage_scrubber/src/checks.rs | 135 ++++++++- .../src/pageserver_physical_gc.rs | 258 ++++++++++++++---- test_runner/regress/test_timeline_archive.py | 114 ++++++++ 5 files changed, 459 insertions(+), 54 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 007bd3eef0..4bb1bbf3cf 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2564,9 +2564,9 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } /// Given the key of a tenant manifest, parse out the generation number -pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option { +pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option { static RE: OnceLock = OnceLock::new(); - let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap()); + let re = RE.get_or_init(|| Regex::new(r".*tenant-manifest-([0-9a-f]{8}).json").unwrap()); re.captures(path.get_path().as_str()) .and_then(|c| c.get(1)) .and_then(|m| Generation::parse_suffix(m.as_str())) diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index c4382cb648..2029847a12 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -43,7 +43,7 @@ impl TenantManifest { offloaded_timelines: vec![], } } - pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result { + pub fn from_json_bytes(bytes: &[u8]) -> Result { serde_json::from_slice::(bytes) } diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 8d855d263c..1b4ff01a17 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -4,17 +4,21 @@ use itertools::Itertools; use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::manifest::TenantManifest; use pageserver_api::shard::ShardIndex; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; use utils::id::TimelineId; +use utils::shard::TenantShardId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; @@ -527,3 +531,132 @@ async fn list_timeline_blobs_impl( unknown_keys, })) } + +pub(crate) struct RemoteTenantManifestInfo { + pub(crate) latest_generation: Option, + pub(crate) manifests: Vec<(Generation, ListingObject)>, +} + +pub(crate) enum ListTenantManifestResult { + WithErrors { + errors: Vec<(String, String)>, + #[allow(dead_code)] + unknown_keys: Vec, + }, + NoErrors(RemoteTenantManifestInfo), +} + +/// Lists the tenant manifests in remote storage and parses the latest one, 
returning a [`ListTenantManifestResult`] object. +pub(crate) async fn list_tenant_manifests( + remote_client: &GenericRemoteStorage, + tenant_id: TenantShardId, + root_target: &RootTarget, +) -> anyhow::Result { + let mut errors = Vec::new(); + let mut unknown_keys = Vec::new(); + + let mut tenant_root_target = root_target.tenant_root(&tenant_id); + let original_prefix = tenant_root_target.prefix_in_bucket.clone(); + const TENANT_MANIFEST_STEM: &str = "tenant-manifest"; + tenant_root_target.prefix_in_bucket += TENANT_MANIFEST_STEM; + tenant_root_target.delimiter = String::new(); + + let mut manifests: Vec<(Generation, ListingObject)> = Vec::new(); + + let prefix_str = &original_prefix + .strip_prefix("/") + .unwrap_or(&original_prefix); + + let mut stream = std::pin::pin!(stream_listing(remote_client, &tenant_root_target)); + 'outer: while let Some(obj) = stream.next().await { + let (key, Some(obj)) = obj? else { + panic!("ListingObject not specified"); + }; + + 'err: { + // TODO a let chain would be nicer here. + let Some(name) = key.object_name() else { + break 'err; + }; + if !name.starts_with(TENANT_MANIFEST_STEM) { + break 'err; + } + let Some(generation) = parse_remote_tenant_manifest_path(key.clone()) else { + break 'err; + }; + tracing::debug!("tenant manifest {key}"); + manifests.push((generation, obj)); + continue 'outer; + } + tracing::info!("Listed an unknown key: {key}"); + unknown_keys.push(obj); + } + + if manifests.is_empty() { + tracing::debug!("No manifest for timeline."); + + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + if !unknown_keys.is_empty() { + errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string())); + + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + + // Find the manifest with the highest generation + let (latest_generation, latest_listing_object) = manifests + .iter() + .max_by_key(|i| i.0) + .map(|(g, obj)| (*g, obj.clone())) + .unwrap(); + + let manifest_bytes = + match download_object_with_retries(remote_client, &latest_listing_object.key).await { + Ok(bytes) => bytes, + Err(e) => { + // It is possible that the tenant gets deleted in-between we list the objects + // and we download the manifest file. 
+ errors.push(( + latest_listing_object.key.get_path().as_str().to_owned(), + format!("failed to download tenant-manifest.json: {e}"), + )); + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + }; + + match TenantManifest::from_json_bytes(&manifest_bytes) { + Ok(_manifest) => { + return Ok(ListTenantManifestResult::NoErrors( + RemoteTenantManifestInfo { + latest_generation: Some(latest_generation), + manifests, + }, + )); + } + Err(parse_error) => errors.push(( + latest_listing_object.key.get_path().as_str().to_owned(), + format!("tenant-manifest.json body parsing error: {parse_error}"), + )), + } + + if errors.is_empty() { + errors.push(( + (*prefix_str).to_owned(), + "Unexpected: no errors did not lead to a successfully parsed blob return".to_string(), + )); + } + + Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }) +} diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 1e69ddbf15..20cb9c3633 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -2,12 +2,16 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::Duration; -use crate::checks::{list_timeline_blobs, BlobDataParseResult}; +use crate::checks::{ + list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult, +}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; -use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::controller_api::TenantDescribeResponse; @@ -25,6 +29,7 @@ use utils::id::{TenantId, TenantTimelineId}; #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, + tenant_manifests_deleted: usize, remote_storage_errors: usize, controller_api_errors: usize, ancestor_layers_deleted: usize, @@ -34,12 +39,14 @@ impl GcSummary { fn merge(&mut self, other: Self) { let Self { indices_deleted, + tenant_manifests_deleted, remote_storage_errors, ancestor_layers_deleted, controller_api_errors, } = other; self.indices_deleted += indices_deleted; + self.tenant_manifests_deleted += tenant_manifests_deleted; self.remote_storage_errors += remote_storage_errors; self.ancestor_layers_deleted += ancestor_layers_deleted; self.controller_api_errors += controller_api_errors; @@ -352,6 +359,69 @@ async fn maybe_delete_index( } } +async fn maybe_delete_tenant_manifest( + remote_client: &GenericRemoteStorage, + min_age: &Duration, + latest_gen: Generation, + obj: &ListingObject, + mode: GcMode, + summary: &mut GcSummary, +) { + // Validation: we will only delete things that parse cleanly + let basename = obj.key.get_path().file_name().unwrap(); + let Some(candidate_generation) = + parse_remote_tenant_manifest_path(RemotePath::from_string(basename).unwrap()) + else { + // A strange key: we will not delete this because we don't understand it. 
+ tracing::warn!("Bad index key"); + return; + }; + + // Validation: we will only delete manifests more than one generation old, and in fact we + // should never be called with such recent generations. + if candidate_generation >= latest_gen { + tracing::warn!("Deletion candidate is >= latest generation, this is a bug!"); + return; + } else if candidate_generation.next() == latest_gen { + tracing::warn!("Deletion candidate is >= latest generation - 1, this is a bug!"); + return; + } + + if !is_old_enough(min_age, obj, summary) { + return; + } + + if matches!(mode, GcMode::DryRun) { + tracing::info!("Dry run: would delete this key"); + return; + } + + // All validations passed: erase the object + let cancel = CancellationToken::new(); + match backoff::retry( + || remote_client.delete(&obj.key, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "maybe_delete_tenant_manifest", + &cancel, + ) + .await + { + None => { + unreachable!("Using a dummy cancellation token"); + } + Some(Ok(_)) => { + tracing::info!("Successfully deleted tenant manifest"); + summary.tenant_manifests_deleted += 1; + } + Some(Err(e)) => { + tracing::warn!("Failed to delete tenant manifest: {e}"); + summary.remote_storage_errors += 1; + } + } +} + #[allow(clippy::too_many_arguments)] async fn gc_ancestor( remote_client: &GenericRemoteStorage, @@ -451,13 +521,100 @@ async fn gc_ancestor( Ok(()) } +async fn gc_tenant_manifests( + remote_client: &GenericRemoteStorage, + min_age: Duration, + target: &RootTarget, + mode: GcMode, + tenant_shard_id: TenantShardId, +) -> anyhow::Result { + let mut gc_summary = GcSummary::default(); + match list_tenant_manifests(remote_client, tenant_shard_id, target).await? { + ListTenantManifestResult::WithErrors { + errors, + unknown_keys: _, + } => { + for (_key, error) in errors { + tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}"); + } + } + ListTenantManifestResult::NoErrors(mut manifest_info) => { + let Some(latest_gen) = manifest_info.latest_generation else { + return Ok(gc_summary); + }; + manifest_info + .manifests + .sort_by_key(|(generation, _obj)| *generation); + // skip the two latest generations (they don't neccessarily have to be 1 apart from each other) + let candidates = manifest_info.manifests.iter().rev().skip(2); + for (_generation, key) in candidates { + maybe_delete_tenant_manifest( + remote_client, + &min_age, + latest_gen, + key, + mode, + &mut gc_summary, + ) + .instrument( + info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key), + ) + .await; + } + } + } + Ok(gc_summary) +} + +async fn gc_timeline( + remote_client: &GenericRemoteStorage, + min_age: &Duration, + target: &RootTarget, + mode: GcMode, + ttid: TenantShardTimelineId, + accumulator: &Arc>, +) -> anyhow::Result { + let mut summary = GcSummary::default(); + let data = list_timeline_blobs(remote_client, ttid, target).await?; + + let (index_part, latest_gen, candidates) = match &data.blob_data { + BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _s3_layers, + } => (index_part, *index_part_generation, data.unused_index_keys), + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it. 
+ return Ok(summary); + } + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); + return Ok(summary); + } + }; + + accumulator.lock().unwrap().update(ttid, index_part); + + for key in candidates { + maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary) + .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key)) + .await; + } + + Ok(summary) +} + /// Physical garbage collection: removing unused S3 objects. /// /// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level /// (keys, layers). This type of garbage collection is about removing: /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between /// uploading a layer and uploading an index) -/// - Index objects from historic generations +/// - Index objects and tenant manifests from historic generations /// /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and /// make sure that object listings don't get slowed down by large numbers of garbage objects. @@ -470,6 +627,7 @@ pub async fn pageserver_physical_gc( ) -> anyhow::Result { let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; + let remote_client = Arc::new(remote_client); let tenants = if tenant_shard_ids.is_empty() { futures::future::Either::Left(stream_tenants(&remote_client, &target)) } else { @@ -484,59 +642,59 @@ pub async fn pageserver_physical_gc( let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); - let timelines = timelines.try_buffered(CONCURRENCY); - let timelines = timelines.try_flatten(); - - // Generate a stream of S3TimelineBlobData - async fn gc_timeline( - remote_client: &GenericRemoteStorage, - min_age: &Duration, - target: &RootTarget, - mode: GcMode, - ttid: TenantShardTimelineId, - accumulator: &Arc>, - ) -> anyhow::Result { - let mut summary = GcSummary::default(); - let data = list_timeline_blobs(remote_client, ttid, target).await?; - - let (index_part, latest_gen, candidates) = match &data.blob_data { - BlobDataParseResult::Parsed { - index_part, - index_part_generation, - s3_layers: _s3_layers, - } => (index_part, *index_part_generation, data.unused_index_keys), - BlobDataParseResult::Relic => { - // Post-deletion tenant location: don't try and GC it. 
- return Ok(summary); - } - BlobDataParseResult::Incorrect { - errors, - s3_layers: _, - } => { - // Our primary purpose isn't to report on bad data, but log this rather than skipping silently - tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); - return Ok(summary); - } - }; - - accumulator.lock().unwrap().update(ttid, index_part); - - for key in candidates { - maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary) - .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key)) - .await; - } - - Ok(summary) + enum GcSummaryOrContent { + Content(T), + GcSummary(GcSummary), } + let timelines = tenants.map_ok(|tenant_shard_id| { + let target_ref = ⌖ + let remote_client_ref = &remote_client; + async move { + let summaries_from_manifests = match gc_tenant_manifests( + remote_client_ref, + min_age, + target_ref, + mode, + tenant_shard_id, + ) + .await + { + Ok(gc_summary) => vec![Ok(GcSummaryOrContent::::GcSummary( + gc_summary, + ))], + Err(e) => { + tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}"); + Vec::new() + } + }; + stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id) + .await + .map(|stream| { + stream + .map_ok(GcSummaryOrContent::Content) + .chain(futures::stream::iter(summaries_from_manifests.into_iter())) + }) + } + }); + let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + let timelines = timelines.try_flatten(); let mut summary = GcSummary::default(); // Drain futures for per-shard GC, populating accumulator as a side effect { - let timelines = timelines.map_ok(|ttid| { - gc_timeline(&remote_client, &min_age, &target, mode, ttid, &accumulator) + let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid { + GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline( + &remote_client, + &min_age, + &target, + mode, + ttid, + &accumulator, + )), + GcSummaryOrContent::GcSummary(gc_summary) => { + futures::future::Either::Right(futures::future::ok(gc_summary)) + } }); let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 5a1e493bbe..e808dd1396 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -835,3 +835,117 @@ def test_timeline_retain_lsn( with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as endpoint: sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") assert sum == pre_branch_sum + + +def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): + """ + Test for scrubber deleting old generations of manifests + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024 ** 2}", + } + ) + + # Create a branch and archive it + child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id) + + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + 
"INSERT INTO foo SELECT FROM generate_series(1,512)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2") + last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id) + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/", + ) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/tenant-manifest", + ) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + # TODO add a proper API to check if a timeline has been offloaded or not + return not any( + timeline["timeline_id"] == str(timeline_id) + for timeline in ps_http.timeline_list(tenant_id=tenant_id) + ) + + def child_offloaded(): + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) + assert timeline_offloaded_api(child_timeline_id) + + wait_until(child_offloaded) + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + # Reboot the pageserver a bunch of times, do unoffloads, offloads + for i in range(5): + env.pageserver.stop() + env.pageserver.start() + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + assert not timeline_offloaded_api(child_timeline_id) + + if i % 2 == 0: + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2") + assert sum == sum_again + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + wait_until(child_offloaded) + + # + # Now ensure that scrubber runs will clean up old generations' manifests. + # + + # Sleep some amount larger than min_age_secs + time.sleep(3) + + # Ensure that min_age_secs has a deletion impeding effect + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["tenant_manifests_deleted"] == 0 + + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] > 0 + assert gc_summary["tenant_manifests_deleted"] > 0 From c719be64741a6459293d68f42d2b8fb783985948 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 3 Dec 2024 23:07:03 +0100 Subject: [PATCH 178/209] tests & benchmarks: unify the way we customize the default tenant config (#9992) Before this PR, some override callbacks used `.default()`, others used `.setdefault()`. As of this PR, all callbacks use `.setdefault()` which I think is least prone to failure. Aligning on a single way will set the right example for future tests that need such customization. The `test_pageserver_getpage_throttle.py` technically is a change in behavior: before, it replaced the `tenant_config` field, now it just configures the throttle. This is what I believe is intended anyway. 
--- test_runner/performance/test_branch_creation.py | 3 +-- test_runner/regress/test_disk_usage_eviction.py | 3 +-- test_runner/regress/test_pageserver_getpage_throttle.py | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 3ce27d6cd3..cf2212d447 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -142,10 +142,9 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier def patch_default_tenant_config(config): - tenant_config = config.get("tenant_config", {}) + tenant_config = config.setdefault("tenant_config", {}) tenant_config["compaction_period"] = "3s" tenant_config["gc_period"] = "0s" - config["tenant_config"] = tenant_config env.pageserver.edit_config_toml(patch_default_tenant_config) env.pageserver.start( diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 05956b5b93..954db914b9 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -62,9 +62,8 @@ def test_min_resident_size_override_handling( if config_level_override is not None: def set_min_resident_size(config): - tenant_config = config.get("tenant_config", {}) + tenant_config = config.setdefault("tenant_config", {}) tenant_config["min_resident_size_override"] = config_level_override - config["tenant_config"] = tenant_config env.pageserver.edit_config_toml(set_min_resident_size) env.pageserver.stop() diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 6d0661f068..9644ebe3e2 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -183,7 +183,8 @@ def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( """ def set_tenant_config(ps_cfg): - ps_cfg["tenant_config"] = {"timeline_get_throttle": throttle_config_with_field_fair_set} + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set neon_env_builder.pageserver_config_override = set_tenant_config env = neon_env_builder.init_start() From 73ccc2b08cb6ca69370733dd4eba520414943dc7 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 3 Dec 2024 22:46:18 +0000 Subject: [PATCH 179/209] test_page_service_batching: fix non-numeric metrics (#9998) ## Problem ``` 2024-12-03T15:42:46.5978335Z + poetry run python /__w/neon/neon/scripts/ingest_perf_test_result.py --ingest /__w/neon/neon/test_runner/perf-report-local 2024-12-03T15:42:49.5325077Z Traceback (most recent call last): 2024-12-03T15:42:49.5325603Z File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 165, in 2024-12-03T15:42:49.5326029Z main() 2024-12-03T15:42:49.5326316Z File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 155, in main 2024-12-03T15:42:49.5326739Z ingested = ingest_perf_test_result(cur, item, recorded_at_timestamp) 2024-12-03T15:42:49.5327488Z ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-12-03T15:42:49.5327914Z File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 99, in ingest_perf_test_result 2024-12-03T15:42:49.5328321Z psycopg2.extras.execute_values( 
2024-12-03T15:42:49.5328940Z File "/github/home/.cache/pypoetry/virtualenvs/non-package-mode-_pxWMzVK-py3.11/lib/python3.11/site-packages/psycopg2/extras.py", line 1299, in execute_values 2024-12-03T15:42:49.5335618Z cur.execute(b''.join(parts)) 2024-12-03T15:42:49.5335967Z psycopg2.errors.InvalidTextRepresentation: invalid input syntax for type numeric: "concurrent-futures" 2024-12-03T15:42:49.5336287Z LINE 57: 'concurrent-futures', 2024-12-03T15:42:49.5336462Z ^ ``` ## Summary of changes - `test_page_service_batching`: save non-numeric params as `labels` - Add a runtime check that `metric_value` is NUMERIC --- test_runner/fixtures/benchmark_fixture.py | 10 ++++++++++ .../pageserver/test_page_service_batching.py | 11 ++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index bb8e75902e..fa3747c08f 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -266,6 +266,16 @@ class NeonBenchmarker: name = f"{self.PROPERTY_PREFIX}_{metric_name}" if labels is None: labels = {} + + # Sometimes mypy can't catch non-numeric values, + # so adding a check here + try: + float(metric_value) + except ValueError as e: + raise ValueError( + f"`metric_value` (`{metric_value}`) must be a NUMERIC-friendly data type" + ) from e + self.property_recorder( name, { diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index 562094a059..2c27368001 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -116,21 +116,18 @@ def test_throughput( # name is not a metric, we just use it to identify the test easily in the `test_...[...]`` notation } ) - params.update( - { - f"pipelining_config.{k}": (v, {}) - for k, v in dataclasses.asdict(pipelining_config).items() - } - ) + # For storing configuration as a metric, insert a fake 0 with labels with actual data + params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})}) log.info("params: %s", params) for param, (value, kwargs) in params.items(): zenbenchmark.record( param, - metric_value=value, + metric_value=float(value), unit=kwargs.pop("unit", ""), report=MetricReport.TEST_PARAM, + labels=kwargs.pop("labels", None), **kwargs, ) From 16163fb8508a4383410d5f8589961fce6c4e3aa2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 4 Dec 2024 01:07:49 +0100 Subject: [PATCH 180/209] page_service: enable batching in Rust & Python Tests + Python benchmarks (#9993) This is the first step towards batching rollout. 
Refs - rollout plan: https://github.com/neondatabase/cloud/issues/20620 - task https://github.com/neondatabase/neon/issues/9377 - uber-epic: https://github.com/neondatabase/neon/issues/9376 --- libs/pageserver_api/src/config.rs | 9 ++++++++- test_runner/fixtures/neon_fixtures.py | 11 +++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index e49d15ba87..09cfbc55fd 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -442,7 +442,14 @@ impl Default for ConfigToml { tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, - page_service_pipelining: PageServicePipeliningConfig::Serial, + page_service_pipelining: if !cfg!(test) { + PageServicePipeliningConfig::Serial + } else { + PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + max_batch_size: NonZeroUsize::new(32).unwrap(), + execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, + }) + }, } } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f55f06bebc..9c579373e8 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1095,6 +1095,17 @@ class NeonEnv: # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, } + + # Batching (https://github.com/neondatabase/neon/issues/9377): + # enable batching by default in tests and benchmarks. + # Compat tests are exempt because old versions fail to parse the new config. + if not config.compatibility_neon_binpath: + ps_cfg["page_service_pipelining"] = { + "mode": "pipelined", + "execution": "concurrent-futures", + "max_batch_size": 32, + } + if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_default_tenant_config_compaction_algorithm is not None: From 1c9bbf1a92ec89fa6cff269cdeaf2c89408e1a35 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 4 Dec 2024 09:25:29 +0000 Subject: [PATCH 181/209] storcon: return an error for drain attempts while paused (#9997) ## Problem We currently allow drain operations to proceed while the node policy is paused. ## Summary of changes Return a precondition failed error in such cases. The orchestrator is updated in https://github.com/neondatabase/infra/pull/2544 to skip drain and fills if the pageserver is paused. 
Closes: https://github.com/neondatabase/neon/issues/9907 --- storage_controller/src/service.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 741d3dc2b4..92ec58cb4d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5681,7 +5681,7 @@ impl Service { } match node_policy { - NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => { + NodeSchedulingPolicy::Active => { self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining)) .await?; From 709b8cd37180e81919a2ac7159e17c7a15becdc2 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 4 Dec 2024 12:07:22 +0100 Subject: [PATCH 182/209] optimize parms for ingest bench (#9999) ## Problem we tried different parallelism settings for ingest bench ## Summary of changes the following settings seem optimal after merging - SK side Wal filtering - batched getpages Settings: - effective_io_concurrency 100 - concurrency limit 200 (different from Prod!) - jobs 4, maintenance workers 7 - 10 GB chunk size --- .github/workflows/ingest_benchmark.yml | 1 + .../performance/test_perf_ingest_using_pgcopydb.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 1033dc6489..a5810e91a4 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -26,6 +26,7 @@ concurrency: jobs: ingest: strategy: + fail-fast: false # allow other variants to continue even if one fails matrix: target_project: [new_empty_project, large_existing_project] permissions: diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index 37f2e9db50..2f4574ba88 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -60,13 +60,13 @@ def build_pgcopydb_command(pgcopydb_filter_file: Path, test_output_dir: Path): "--no-acl", "--skip-db-properties", "--table-jobs", - "8", + "4", "--index-jobs", - "8", + "4", "--restore-jobs", - "8", + "4", "--split-tables-larger-than", - "5GB", + "10GB", "--skip-extensions", "--use-copy-binary", "--filters", @@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path): "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), - "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=16", + "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", } # Combine the current environment with custom variables env = os.environ.copy() From 33790d14a320cb5e2fd8b1cd6ba2886b79b67127 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 4 Dec 2024 12:37:24 +0100 Subject: [PATCH 183/209] fix parsing human time output like "50m37s" (#10001) ## Problem In ingest_benchmark.yml workflow we use pgcopydb tool to migrate project. pgcopydb logs human time. Our parsing of the human time doesn't work for times like "50m37s". 
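A quick check with Python's `re` shows the failure mode: the old alternation has no minutes+seconds branch, so `re.search` latches onto the trailing `NNs` and drops the minutes, while the new pattern (the one added below) captures the whole duration. The log line here is made up for illustration; the two patterns are the before/after of this change:

```python
import re

OLD = r"\d+h\d+m|\d+s|\d+ms|\d+\.\d+s"
NEW = r"\d+h\d+m|\d+m\d+s|\d+s|\d+ms|\d+\.\d+s"

line = "COPY done in 50m37s"
assert re.search(OLD, line).group(0) == "37s"     # only the seconds part
assert re.search(NEW, line).group(0) == "50m37s"  # full minutes+seconds value
```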
[Example workflow](https://github.com/neondatabase/neon/actions/runs/12145539948/job/33867418065#step:10:479) contains "57m45s" but we [reported](https://github.com/neondatabase/neon/actions/runs/12145539948/job/33867418065#step:10:500) only the seconds part: 45.000 s ## Summary of changes add a regex pattern for Minute/Second combination --- test_runner/performance/test_perf_ingest_using_pgcopydb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index 2f4574ba88..f0a0c1f5a2 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -184,7 +184,7 @@ def parse_log_and_report_metrics( for metric_name, pattern in metric_patterns.items(): if pattern.search(line): # Extract duration and convert it to seconds - duration_match = re.search(r"\d+h\d+m|\d+s|\d+ms|\d+\.\d+s", line) + duration_match = re.search(r"\d+h\d+m|\d+m\d+s|\d+s|\d+ms|\d+\.\d+s", line) if duration_match: duration_str = duration_match.group(0) parts = re.findall(r"\d+[a-zA-Z]+", duration_str) From 13285c2a5e5ad450775d9f40c0fee3a6c62da61f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 4 Dec 2024 13:53:52 +0100 Subject: [PATCH 184/209] pageserver: return proper status code for heatmap_upload errors (#9991) ## Problem During deploys, we see a lot of 500 errors due to heapmap uploads for inactive tenants. These should be 503s instead. Resolves #9574. ## Summary of changes Make the secondary tenant scheduler use `ApiError` rather than `anyhow::Error`, to propagate the tenant error and convert it to an appropriate status code. --- pageserver/src/http/routes.rs | 28 ++++++++++++---- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/mgr.rs | 5 ++- pageserver/src/tenant/secondary.rs | 33 +++++++++++++++---- pageserver/src/tenant/secondary/downloader.rs | 13 ++++---- .../src/tenant/secondary/heatmap_uploader.rs | 10 +++--- pageserver/src/tenant/secondary/scheduler.rs | 4 +-- 7 files changed, 68 insertions(+), 27 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e127871549..e04f1460a8 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -279,7 +279,10 @@ impl From for ApiError { impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { - GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), + GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {tid}").into()), + GetTenantError::ShardNotFound(tid) => { + ApiError::NotFound(anyhow!("tenant {tid}").into()) + } GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? 
// Because we must be careful to never return 404 for a tenant if it does @@ -387,6 +390,16 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(ste: crate::tenant::secondary::SecondaryTenantError) -> ApiError { + use crate::tenant::secondary::SecondaryTenantError; + match ste { + SecondaryTenantError::GetTenant(gte) => gte.into(), + SecondaryTenantError::ShuttingDown => ApiError::ShuttingDown, + } + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, @@ -1047,9 +1060,11 @@ async fn timeline_delete_handler( match e { // GetTenantError has a built-in conversion to ApiError, but in this context we don't // want to treat missing tenants as 404, to avoid ambiguity with successful deletions. - GetTenantError::NotFound(_) => ApiError::PreconditionFailed( - "Requested tenant is missing".to_string().into_boxed_str(), - ), + GetTenantError::NotFound(_) | GetTenantError::ShardNotFound(_) => { + ApiError::PreconditionFailed( + "Requested tenant is missing".to_string().into_boxed_str(), + ) + } e => e.into(), } })?; @@ -2462,8 +2477,7 @@ async fn secondary_upload_handler( state .secondary_controller .upload_tenant(tenant_shard_id) - .await - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, ()) } @@ -2578,7 +2592,7 @@ async fn secondary_download_handler( // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered // okay. We could get an error here in the unlikely edge case that the tenant // was detached between our check above and executing the download job. - Ok(Err(e)) => return Err(ApiError::InternalServerError(e)), + Ok(Err(e)) => return Err(e.into()), // A timeout is not an error: we have started the download, we're just not done // yet. The caller will get a response body indicating status. Err(_) => StatusCode::ACCEPTED, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ada5c4a977..5a9e398586 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3422,7 +3422,7 @@ impl Tenant { r.map_err( |_e: tokio::sync::watch::error::RecvError| // Tenant existed but was dropped: report it as non-existent - GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) + GetActiveTenantError::NotFound(GetTenantError::ShardNotFound(self.tenant_shard_id)) )? 
} Err(TimeoutCancellableError::Cancelled) => { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 45481c4ed4..e8b0d1d4dd 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -894,7 +894,7 @@ impl TenantManager { Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)), Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), None | Some(TenantSlot::Secondary(_)) => { - Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) + Err(GetTenantError::ShardNotFound(tenant_shard_id)) } } } @@ -2258,6 +2258,9 @@ pub(crate) enum GetTenantError { #[error("Tenant {0} not found")] NotFound(TenantId), + #[error("Tenant {0} not found")] + ShardNotFound(TenantShardId), + #[error("Tenant {0} is not active")] NotActive(TenantShardId), diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 3df89a928c..4bc208331b 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -22,6 +22,7 @@ use super::{ mgr::TenantManager, span::debug_assert_current_span_has_tenant_id, storage_layer::LayerName, + GetTenantError, }; use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; @@ -66,7 +67,21 @@ struct CommandRequest { } struct CommandResponse { - result: anyhow::Result<()>, + result: Result<(), SecondaryTenantError>, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum SecondaryTenantError { + #[error("{0}")] + GetTenant(GetTenantError), + #[error("shutting down")] + ShuttingDown, +} + +impl From for SecondaryTenantError { + fn from(gte: GetTenantError) -> Self { + Self::GetTenant(gte) + } } // Whereas [`Tenant`] represents an attached tenant, this type represents the work @@ -285,7 +300,7 @@ impl SecondaryController { &self, queue: &tokio::sync::mpsc::Sender>, payload: T, - ) -> anyhow::Result<()> { + ) -> Result<(), SecondaryTenantError> { let (response_tx, response_rx) = tokio::sync::oneshot::channel(); queue @@ -294,20 +309,26 @@ impl SecondaryController { response_tx, }) .await - .map_err(|_| anyhow::anyhow!("Receiver shut down"))?; + .map_err(|_| SecondaryTenantError::ShuttingDown)?; let response = response_rx .await - .map_err(|_| anyhow::anyhow!("Request dropped"))?; + .map_err(|_| SecondaryTenantError::ShuttingDown)?; response.result } - pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + pub(crate) async fn upload_tenant( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), SecondaryTenantError> { self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) .await } - pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + pub(crate) async fn download_tenant( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), SecondaryTenantError> { self.dispatch( &self.download_req_tx, DownloadCommand::Download(tenant_shard_id), diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8d771dc405..701e4cf04b 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -35,7 +35,7 @@ use super::{ self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs, }, - SecondaryTenant, + GetTenantError, SecondaryTenant, SecondaryTenantError, }; use crate::tenant::{ @@ -470,15 +470,16 @@ impl JobGenerator anyhow::Result { + fn on_command( + &mut self, + command: DownloadCommand, + ) -> Result { let tenant_shard_id = 
command.get_tenant_shard_id(); let tenant = self .tenant_manager - .get_secondary_tenant_shard(*tenant_shard_id); - let Some(tenant) = tenant else { - return Err(anyhow::anyhow!("Not found or not in Secondary mode")); - }; + .get_secondary_tenant_shard(*tenant_shard_id) + .ok_or(GetTenantError::ShardNotFound(*tenant_shard_id))?; Ok(PendingDownload { target_time: None, diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index e680fd705b..c5e5e04945 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -28,7 +28,7 @@ use super::{ self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs, }, - CommandRequest, UploadCommand, + CommandRequest, SecondaryTenantError, UploadCommand, }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; @@ -279,7 +279,10 @@ impl JobGenerator }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) } - fn on_command(&mut self, command: UploadCommand) -> anyhow::Result { + fn on_command( + &mut self, + command: UploadCommand, + ) -> Result { let tenant_shard_id = command.get_tenant_shard_id(); tracing::info!( @@ -287,8 +290,7 @@ impl JobGenerator "Starting heatmap write on command"); let tenant = self .tenant_manager - .get_attached_tenant_shard(*tenant_shard_id) - .map_err(|e| anyhow::anyhow!(e))?; + .get_attached_tenant_shard(*tenant_shard_id)?; if !tenant.is_active() { return Err(GetTenantError::NotActive(*tenant_shard_id).into()); } diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 28cf2125df..e963c722b9 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -12,7 +12,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::{completion::Barrier, yielding_loop::yielding_loop}; -use super::{CommandRequest, CommandResponse}; +use super::{CommandRequest, CommandResponse, SecondaryTenantError}; /// Scheduling interval is the time between calls to JobGenerator::schedule. /// When we schedule jobs, the job generator may provide a hint of its preferred @@ -112,7 +112,7 @@ where /// Called when a command is received. A job will be spawned immediately if the return /// value is Some, ignoring concurrency limits and the pending queue. 
- fn on_command(&mut self, cmd: CMD) -> anyhow::Result; + fn on_command(&mut self, cmd: CMD) -> Result; } /// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling From 6359342ffbc5b712350491f4f89e08d18626c786 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 4 Dec 2024 13:58:31 +0100 Subject: [PATCH 185/209] Assign /libs/proxy/ to proxy team (#10003) --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index 21b0e7c51f..f41462c98b 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,7 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage +/libs/proxy/ @neondatabase/proxy /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/storage /libs/vm_monitor/ @neondatabase/autoscaling From cab498c7876db989c9bacb62cc94fe29c10b143a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Dec 2024 12:58:35 +0000 Subject: [PATCH 186/209] feat(proxy): add option to forward startup params (#9979) (stacked on #9990 and #9995) Partially fixes #1287 with a custom option field to enable the fixed behaviour. This allows us to gradually roll out the fix without silently changing the observed behaviour for our customers. related to https://github.com/neondatabase/cloud/issues/15284 --- Cargo.lock | 4 +- Cargo.toml | 2 +- libs/pq_proto/src/lib.rs | 2 +- .../src/authentication/sasl.rs | 4 +- .../src/message/frontend.rs | 28 ++- libs/proxy/tokio-postgres2/src/codec.rs | 13 +- libs/proxy/tokio-postgres2/src/config.rs | 193 +++--------------- libs/proxy/tokio-postgres2/src/connect.rs | 49 +---- libs/proxy/tokio-postgres2/src/connect_raw.rs | 31 +-- proxy/src/cancellation.rs | 11 +- proxy/src/compute.rs | 96 ++++----- proxy/src/console_redirect_proxy.rs | 1 + proxy/src/proxy/connect_compute.rs | 4 +- proxy/src/proxy/mod.rs | 40 +++- proxy/src/proxy/tests/mitm.rs | 8 +- proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/serverless/backend.rs | 11 +- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_proxy.py | 19 ++ 19 files changed, 180 insertions(+), 340 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b80ec5e93..38158b7aec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1031,9 +1031,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.5.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" dependencies = [ "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 91fa6a2607..a35823e0c2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ bindgen = "0.70" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" -bytes = "1.0" +bytes = "1.9" camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 43dfbc22a4..94714359a3 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -100,7 +100,7 @@ impl StartupMessageParamsBuilder { #[derive(Debug, Clone, Default)] pub struct StartupMessageParams { - params: Bytes, + pub params: Bytes, } impl StartupMessageParams { diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index 
19aa3c1e9a..f2200a40ce 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -117,7 +117,7 @@ enum Credentials { /// A regular password as a vector of bytes. Password(Vec), /// A precomputed pair of keys. - Keys(Box>), + Keys(ScramKeys), } enum State { @@ -176,7 +176,7 @@ impl ScramSha256 { /// Constructs a new instance which will use the provided key pair for authentication. pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 { - let password = Credentials::Keys(keys.into()); + let password = Credentials::Keys(keys); ScramSha256::new_inner(password, channel_binding, nonce()) } diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index 5d0a8ff8c8..bc6168f337 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -255,22 +255,34 @@ pub fn ssl_request(buf: &mut BytesMut) { } #[inline] -pub fn startup_message<'a, I>(parameters: I, buf: &mut BytesMut) -> io::Result<()> -where - I: IntoIterator, -{ +pub fn startup_message(parameters: &StartupMessageParams, buf: &mut BytesMut) -> io::Result<()> { write_body(buf, |buf| { // postgres protocol version 3.0(196608) in bigger-endian buf.put_i32(0x00_03_00_00); - for (key, value) in parameters { - write_cstr(key.as_bytes(), buf)?; - write_cstr(value.as_bytes(), buf)?; - } + buf.put_slice(¶meters.params); buf.put_u8(0); Ok(()) }) } +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct StartupMessageParams { + pub params: BytesMut, +} + +impl StartupMessageParams { + /// Set parameter's value by its name. + pub fn insert(&mut self, name: &str, value: &str) { + if name.contains('\0') || value.contains('\0') { + panic!("startup parameter name or value contained a null") + } + self.params.put_slice(name.as_bytes()); + self.params.put_u8(0); + self.params.put_slice(value.as_bytes()); + self.params.put_u8(0); + } +} + #[inline] pub fn sync(buf: &mut BytesMut) { buf.put_u8(b'S'); diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index 7412db785b..0ec46198ce 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -35,9 +35,7 @@ impl FallibleIterator for BackendMessages { } } -pub struct PostgresCodec { - pub max_message_size: Option, -} +pub struct PostgresCodec; impl Encoder for PostgresCodec { type Error = io::Error; @@ -66,15 +64,6 @@ impl Decoder for PostgresCodec { break; } - if let Some(max) = self.max_message_size { - if len > max { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "message too large", - )); - } - } - match header.tag() { backend::NOTICE_RESPONSE_TAG | backend::NOTIFICATION_RESPONSE_TAG diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index fd10ef6f20..11a361a81b 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -6,6 +6,7 @@ use crate::connect_raw::RawConnection; use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; +use postgres_protocol2::message::frontend::StartupMessageParams; use std::fmt; use std::str; use std::time::Duration; @@ -14,16 +15,6 @@ use tokio::io::{AsyncRead, AsyncWrite}; pub use postgres_protocol2::authentication::sasl::ScramKeys; use tokio::net::TcpStream; -/// Properties required of a session. 
-#[derive(Debug, Copy, Clone, PartialEq, Eq)] -#[non_exhaustive] -pub enum TargetSessionAttrs { - /// No special properties are required. - Any, - /// The session must allow writes. - ReadWrite, -} - /// TLS configuration. #[derive(Debug, Copy, Clone, PartialEq, Eq)] #[non_exhaustive] @@ -73,94 +64,20 @@ pub enum AuthKeys { } /// Connection configuration. -/// -/// Configuration can be parsed from libpq-style connection strings. These strings come in two formats: -/// -/// # Key-Value -/// -/// This format consists of space-separated key-value pairs. Values which are either the empty string or contain -/// whitespace should be wrapped in `'`. `'` and `\` characters should be backslash-escaped. -/// -/// ## Keys -/// -/// * `user` - The username to authenticate with. Required. -/// * `password` - The password to authenticate with. -/// * `dbname` - The name of the database to connect to. Defaults to the username. -/// * `options` - Command line options used to configure the server. -/// * `application_name` - Sets the `application_name` parameter on the server. -/// * `sslmode` - Controls usage of TLS. If set to `disable`, TLS will not be used. If set to `prefer`, TLS will be used -/// if available, but not used otherwise. If set to `require`, TLS will be forced to be used. Defaults to `prefer`. -/// * `host` - The host to connect to. On Unix platforms, if the host starts with a `/` character it is treated as the -/// path to the directory containing Unix domain sockets. Otherwise, it is treated as a hostname. Multiple hosts -/// can be specified, separated by commas. Each host will be tried in turn when connecting. Required if connecting -/// with the `connect` method. -/// * `port` - The port to connect to. Multiple ports can be specified, separated by commas. The number of ports must be -/// either 1, in which case it will be used for all hosts, or the same as the number of hosts. Defaults to 5432 if -/// omitted or the empty string. -/// * `connect_timeout` - The time limit in seconds applied to each socket-level connection attempt. Note that hostnames -/// can resolve to multiple IP addresses, and this limit is applied to each address. Defaults to no timeout. -/// * `target_session_attrs` - Specifies requirements of the session. If set to `read-write`, the client will check that -/// the `transaction_read_write` session parameter is set to `on`. This can be used to connect to the primary server -/// in a database cluster as opposed to the secondary read-only mirrors. Defaults to `all`. -/// * `channel_binding` - Controls usage of channel binding in the authentication process. If set to `disable`, channel -/// binding will not be used. If set to `prefer`, channel binding will be used if available, but not used otherwise. -/// If set to `require`, the authentication process will fail if channel binding is not used. Defaults to `prefer`. -/// -/// ## Examples -/// -/// ```not_rust -/// host=localhost user=postgres connect_timeout=10 keepalives=0 -/// ``` -/// -/// ```not_rust -/// host=/var/lib/postgresql,localhost port=1234 user=postgres password='password with spaces' -/// ``` -/// -/// ```not_rust -/// host=host1,host2,host3 port=1234,,5678 user=postgres target_session_attrs=read-write -/// ``` -/// -/// # Url -/// -/// This format resembles a URL with a scheme of either `postgres://` or `postgresql://`. All components are optional, -/// and the format accepts query parameters for all of the key-value pairs described in the section above. 
Multiple -/// host/port pairs can be comma-separated. Unix socket paths in the host section of the URL should be percent-encoded, -/// as the path component of the URL specifies the database name. -/// -/// ## Examples -/// -/// ```not_rust -/// postgresql://user@localhost -/// ``` -/// -/// ```not_rust -/// postgresql://user:password@%2Fvar%2Flib%2Fpostgresql/mydb?connect_timeout=10 -/// ``` -/// -/// ```not_rust -/// postgresql://user@host1:1234,host2,host3:5678?target_session_attrs=read-write -/// ``` -/// -/// ```not_rust -/// postgresql:///mydb?user=user&host=/var/lib/postgresql -/// ``` #[derive(Clone, PartialEq, Eq)] pub struct Config { pub(crate) host: Host, pub(crate) port: u16, - pub(crate) user: Option, pub(crate) password: Option>, pub(crate) auth_keys: Option>, - pub(crate) dbname: Option, - pub(crate) options: Option, - pub(crate) application_name: Option, pub(crate) ssl_mode: SslMode, pub(crate) connect_timeout: Option, - pub(crate) target_session_attrs: TargetSessionAttrs, pub(crate) channel_binding: ChannelBinding, - pub(crate) replication_mode: Option, - pub(crate) max_backend_message_size: Option, + pub(crate) server_params: StartupMessageParams, + + database: bool, + username: bool, } impl Config { @@ -169,18 +86,15 @@ impl Config { Config { host: Host::Tcp(host), port, - user: None, password: None, auth_keys: None, - dbname: None, - options: None, - application_name: None, ssl_mode: SslMode::Prefer, connect_timeout: None, - target_session_attrs: TargetSessionAttrs::Any, channel_binding: ChannelBinding::Prefer, - replication_mode: None, - max_backend_message_size: None, + server_params: StartupMessageParams::default(), + + database: false, + username: false, } } @@ -188,14 +102,13 @@ impl Config { /// /// Required. pub fn user(&mut self, user: &str) -> &mut Config { - self.user = Some(user.to_string()); - self + self.set_param("user", user) } /// Gets the user to authenticate with, if one has been configured with /// the `user` method. - pub fn get_user(&self) -> Option<&str> { - self.user.as_deref() + pub fn user_is_set(&self) -> bool { + self.username } /// Sets the password to authenticate with. @@ -231,40 +144,26 @@ impl Config { /// /// Defaults to the user. pub fn dbname(&mut self, dbname: &str) -> &mut Config { - self.dbname = Some(dbname.to_string()); - self + self.set_param("database", dbname) } /// Gets the name of the database to connect to, if one has been configured /// with the `dbname` method. - pub fn get_dbname(&self) -> Option<&str> { - self.dbname.as_deref() + pub fn db_is_set(&self) -> bool { + self.database } - /// Sets command line options used to configure the server. - pub fn options(&mut self, options: &str) -> &mut Config { - self.options = Some(options.to_string()); + pub fn set_param(&mut self, name: &str, value: &str) -> &mut Config { + if name == "database" { + self.database = true; + } else if name == "user" { + self.username = true; + } + + self.server_params.insert(name, value); self } - /// Gets the command line options used to configure the server, if the - /// options have been set with the `options` method. - pub fn get_options(&self) -> Option<&str> { - self.options.as_deref() - } - - /// Sets the value of the `application_name` runtime parameter. - pub fn application_name(&mut self, application_name: &str) -> &mut Config { - self.application_name = Some(application_name.to_string()); - self - } - - /// Gets the value of the `application_name` runtime parameter, if it has - /// been set with the `application_name` method. 
- pub fn get_application_name(&self) -> Option<&str> { - self.application_name.as_deref() - } - /// Sets the SSL configuration. /// /// Defaults to `prefer`. @@ -303,23 +202,6 @@ impl Config { self.connect_timeout.as_ref() } - /// Sets the requirements of the session. - /// - /// This can be used to connect to the primary server in a clustered database rather than one of the read-only - /// secondary servers. Defaults to `Any`. - pub fn target_session_attrs( - &mut self, - target_session_attrs: TargetSessionAttrs, - ) -> &mut Config { - self.target_session_attrs = target_session_attrs; - self - } - - /// Gets the requirements of the session. - pub fn get_target_session_attrs(&self) -> TargetSessionAttrs { - self.target_session_attrs - } - /// Sets the channel binding behavior. /// /// Defaults to `prefer`. @@ -333,28 +215,6 @@ impl Config { self.channel_binding } - /// Set replication mode. - pub fn replication_mode(&mut self, replication_mode: ReplicationMode) -> &mut Config { - self.replication_mode = Some(replication_mode); - self - } - - /// Get replication mode. - pub fn get_replication_mode(&self) -> Option { - self.replication_mode - } - - /// Set limit for backend messages size. - pub fn max_backend_message_size(&mut self, max_backend_message_size: usize) -> &mut Config { - self.max_backend_message_size = Some(max_backend_message_size); - self - } - - /// Get limit for backend messages size. - pub fn get_max_backend_message_size(&self) -> Option { - self.max_backend_message_size - } - /// Opens a connection to a PostgreSQL database. /// /// Requires the `runtime` Cargo feature (enabled by default). @@ -392,18 +252,13 @@ impl fmt::Debug for Config { } f.debug_struct("Config") - .field("user", &self.user) .field("password", &self.password.as_ref().map(|_| Redaction {})) - .field("dbname", &self.dbname) - .field("options", &self.options) - .field("application_name", &self.application_name) .field("ssl_mode", &self.ssl_mode) .field("host", &self.host) .field("port", &self.port) .field("connect_timeout", &self.connect_timeout) - .field("target_session_attrs", &self.target_session_attrs) .field("channel_binding", &self.channel_binding) - .field("replication", &self.replication_mode) + .field("server_params", &self.server_params) .finish() } } diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 75a58e6eac..e0cb69748d 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,14 +1,11 @@ use crate::client::SocketConfig; use crate::codec::BackendMessage; -use crate::config::{Host, TargetSessionAttrs}; +use crate::config::Host; use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; use crate::tls::{MakeTlsConnect, TlsConnect}; -use crate::{Client, Config, Connection, Error, RawConnection, SimpleQueryMessage}; -use futures_util::{future, pin_mut, Future, FutureExt, Stream}; +use crate::{Client, Config, Connection, Error, RawConnection}; use postgres_protocol2::message::backend::Message; -use std::io; -use std::task::Poll; use tokio::net::TcpStream; use tokio::sync::mpsc; @@ -72,47 +69,7 @@ where .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) .collect(); - let mut connection = Connection::new(stream, delayed, parameters, receiver); - - if let TargetSessionAttrs::ReadWrite = config.target_session_attrs { - let rows = client.simple_query_raw("SHOW transaction_read_only"); - pin_mut!(rows); - - let rows = future::poll_fn(|cx| { - if 
connection.poll_unpin(cx)?.is_ready() { - return Poll::Ready(Err(Error::closed())); - } - - rows.as_mut().poll(cx) - }) - .await?; - pin_mut!(rows); - - loop { - let next = future::poll_fn(|cx| { - if connection.poll_unpin(cx)?.is_ready() { - return Poll::Ready(Some(Err(Error::closed()))); - } - - rows.as_mut().poll_next(cx) - }); - - match next.await.transpose()? { - Some(SimpleQueryMessage::Row(row)) => { - if row.try_get(0)? == Some("on") { - return Err(Error::connect(io::Error::new( - io::ErrorKind::PermissionDenied, - "database does not allow writes", - ))); - } else { - break; - } - } - Some(_) => {} - None => return Err(Error::unexpected_message()), - } - } - } + let connection = Connection::new(stream, delayed, parameters, receiver); Ok((client, connection)) } diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 390f133002..66db85e07d 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -1,5 +1,5 @@ use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; -use crate::config::{self, AuthKeys, Config, ReplicationMode}; +use crate::config::{self, AuthKeys, Config}; use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::{TlsConnect, TlsStream}; @@ -96,12 +96,7 @@ where let stream = connect_tls(stream, config.ssl_mode, tls).await?; let mut stream = StartupStream { - inner: Framed::new( - stream, - PostgresCodec { - max_message_size: config.max_backend_message_size, - }, - ), + inner: Framed::new(stream, PostgresCodec), buf: BackendMessages::empty(), delayed_notice: Vec::new(), }; @@ -124,28 +119,8 @@ where S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin, { - let mut params = vec![("client_encoding", "UTF8")]; - if let Some(user) = &config.user { - params.push(("user", &**user)); - } - if let Some(dbname) = &config.dbname { - params.push(("database", &**dbname)); - } - if let Some(options) = &config.options { - params.push(("options", &**options)); - } - if let Some(application_name) = &config.application_name { - params.push(("application_name", &**application_name)); - } - if let Some(replication_mode) = &config.replication_mode { - match replication_mode { - ReplicationMode::Physical => params.push(("replication", "true")), - ReplicationMode::Logical => params.push(("replication", "database")), - } - } - let mut buf = BytesMut::new(); - frontend::startup_message(params, &mut buf).map_err(Error::encode)?; + frontend::startup_message(&config.server_params, &mut buf).map_err(Error::encode)?; stream .send(FrontendMessage::Raw(buf.freeze())) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index bcb0ef40bd..7bc5587a25 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -70,11 +70,12 @@ impl ReportableError for CancelError { impl CancellationHandler

{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. pub(crate) fn get_session(self: Arc) -> Session

{ - // HACK: We'd rather get the real backend_pid but postgres_client doesn't - // expose it and we don't want to do another roundtrip to query - // for it. The client will be able to notice that this is not the - // actual backend_pid, but backend_pid is not used for anything - // so it doesn't matter. + // we intentionally generate a random "backend pid" and "secret key" here. + // we use the corresponding u64 as an identifier for the + // actual endpoint+pid+secret for postgres/pgbouncer. + // + // if we forwarded the backend_pid from postgres to the client, there would be a lot + // of overlap between our computes as most pids are small (~100). let key = loop { let key = rand::random(); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ab0ff4b795..4113b5bb80 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -131,49 +131,37 @@ impl ConnCfg { } /// Apply startup message params to the connection config. - pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) { - // Only set `user` if it's not present in the config. - // Console redirect auth flow takes username from the console's response. - if let (None, Some(user)) = (self.get_user(), params.get("user")) { - self.user(user); + pub(crate) fn set_startup_params( + &mut self, + params: &StartupMessageParams, + arbitrary_params: bool, + ) { + if !arbitrary_params { + self.set_param("client_encoding", "UTF8"); } - - // Only set `dbname` if it's not present in the config. - // Console redirect auth flow takes dbname from the console's response. - if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { - self.dbname(dbname); - } - - // Don't add `options` if they were only used for specifying a project. - // Connection pools don't support `options`, because they affect backend startup. - if let Some(options) = filtered_options(params) { - self.options(&options); - } - - if let Some(app_name) = params.get("application_name") { - self.application_name(app_name); - } - - // TODO: This is especially ugly... - if let Some(replication) = params.get("replication") { - use postgres_client::config::ReplicationMode; - match replication { - "true" | "on" | "yes" | "1" => { - self.replication_mode(ReplicationMode::Physical); + for (k, v) in params.iter() { + match k { + // Only set `user` if it's not present in the config. + // Console redirect auth flow takes username from the console's response. + "user" if self.user_is_set() => continue, + "database" if self.db_is_set() => continue, + "options" => { + if let Some(options) = filtered_options(v) { + self.set_param(k, &options); + } } - "database" => { - self.replication_mode(ReplicationMode::Logical); + "user" | "database" | "application_name" | "replication" => { + self.set_param(k, v); } - _other => {} + + // if we allow arbitrary params, then we forward them through. + // this is a flag for a period of backwards compatibility + k if arbitrary_params => { + self.set_param(k, v); + } + _ => {} } } - - // TODO: extend the list of the forwarded startup parameters. - // Currently, tokio-postgres doesn't allow us to pass - // arbitrary parameters, but the ones above are a good start. - // - // This and the reverse params problem can be better addressed - // in a bespoke connection machinery (a new library for that sake). } } @@ -347,10 +335,9 @@ impl ConnCfg { } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. 
-fn filtered_options(params: &StartupMessageParams) -> Option { +fn filtered_options(options: &str) -> Option { #[allow(unstable_name_collisions)] - let options: String = params - .options_raw()? + let options: String = StartupMessageParams::parse_options_raw(options) .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -427,27 +414,24 @@ mod tests { #[test] fn test_filtered_options() { // Empty options is unlikely to be useful anyway. - let params = StartupMessageParams::new([("options", "")]); - assert_eq!(filtered_options(¶ms), None); + let params = ""; + assert_eq!(filtered_options(params), None); // It's likely that clients will only use options to specify endpoint/project. - let params = StartupMessageParams::new([("options", "project=foo")]); - assert_eq!(filtered_options(¶ms), None); + let params = "project=foo"; + assert_eq!(filtered_options(params), None); // Same, because unescaped whitespaces are no-op. - let params = StartupMessageParams::new([("options", " project=foo ")]); - assert_eq!(filtered_options(¶ms).as_deref(), None); + let params = " project=foo "; + assert_eq!(filtered_options(params).as_deref(), None); - let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); - assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ ")); + let params = r"\ project=foo \ "; + assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ ")); - let params = StartupMessageParams::new([("options", "project = foo")]); - assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + let params = "project = foo"; + assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); - let params = StartupMessageParams::new([( - "options", - "project = foo neon_endpoint_type:read_write neon_lsn:0/2", - )]); - assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2 neon_proxy_params_compat:true"; + assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 8f78df1964..7db1179eea 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -206,6 +206,7 @@ pub(crate) async fn handle_client( let mut node = connect_to_compute( ctx, &TcpMechanism { + params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, }, diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 585dce7bae..a3027abd7c 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -66,6 +66,8 @@ pub(crate) trait ComputeConnectBackend { } pub(crate) struct TcpMechanism<'a> { + pub(crate) params_compat: bool, + /// KV-dictionary with PostgreSQL connection params. 
pub(crate) params: &'a StartupMessageParams, @@ -92,7 +94,7 @@ impl ConnectMechanism for TcpMechanism<'_> { } fn update_connect_config(&self, config: &mut compute::ConnCfg) { - config.set_startup_params(self.params); + config.set_startup_params(self.params, self.params_compat); } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index af97fb3d71..f74eb5940f 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -338,9 +338,17 @@ pub(crate) async fn handle_client( } }; + let params_compat = match &user_info { + auth::Backend::ControlPlane(_, info) => { + info.info.options.get(NeonOptions::PARAMS_COMPAT).is_some() + } + auth::Backend::Local(_) => false, + }; + let mut node = connect_to_compute( ctx, &TcpMechanism { + params_compat, params: ¶ms, locks: &config.connect_compute_locks, }, @@ -409,19 +417,47 @@ pub(crate) async fn prepare_client_connection

( pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { + // proxy options: + + /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute. + const PARAMS_COMPAT: &str = "proxy_params_compat"; + + // cplane options: + + /// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN. + const LSN: &str = "lsn"; + + /// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write. + const ENDPOINT_TYPE: &str = "endpoint_type"; + pub(crate) fn parse_params(params: &StartupMessageParams) -> Self { params .options_raw() .map(Self::parse_from_iter) .unwrap_or_default() } + pub(crate) fn parse_options_raw(options: &str) -> Self { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } + pub(crate) fn get(&self, key: &str) -> Option { + self.0 + .iter() + .find_map(|(k, v)| (k == key).then_some(v)) + .cloned() + } + pub(crate) fn is_ephemeral(&self) -> bool { - // Currently, neon endpoint options are all reserved for ephemeral endpoints. - !self.0.is_empty() + self.0.iter().any(|(k, _)| match &**k { + // This is not a cplane option, we know it does not create ephemeral computes. + Self::PARAMS_COMPAT => false, + Self::LSN => true, + Self::ENDPOINT_TYPE => true, + // err on the side of caution. any cplane options we don't know about + // might lead to ephemeral computes. + _ => true, + }) } fn parse_from_iter<'a>(options: impl Iterator) -> Self { diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index d72331c7bf..59c9ac27b8 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -55,7 +55,13 @@ async fn proxy_mitm( // give the end_server the startup parameters let mut buf = BytesMut::new(); - frontend::startup_message(startup.iter(), &mut buf).unwrap(); + frontend::startup_message( + &postgres_protocol::message::frontend::StartupMessageParams { + params: startup.params.into(), + }, + &mut buf, + ) + .unwrap(); end_server.send(buf.freeze()).await.unwrap(); // proxy messages between end_client and end_server diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 53345431e3..911b349416 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -252,7 +252,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") - .options("project=generic-project-name") + .set_param("options", "project=generic-project-name") .ssl_mode(SslMode::Prefer) .connect_raw(server, NoTls) .await?; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 55d2e47fd3..251aa47084 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -309,10 +309,13 @@ impl PoolingBackend { .config .user(&conn_info.user_info.user) .dbname(&conn_info.dbname) - .options(&format!( - "-c pg_session_jwt.jwk={}", - serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") - )); + .set_param( + "options", + &format!( + "-c pg_session_jwt.jwk={}", + serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") + ), + ); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(postgres_client::NoTls).await?; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9c579373e8..60c4a23936 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ 
b/test_runner/fixtures/neon_fixtures.py @@ -269,7 +269,7 @@ class PgProtocol: for match in re.finditer(r"-c(\w*)=(\w*)", options): key = match.group(1) val = match.group(2) - if "server_options" in conn_options: + if "server_settings" in conn_options: conn_options["server_settings"].update({key: val}) else: conn_options["server_settings"] = {key: val} diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 5a01d90d85..d8df2efc78 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -5,6 +5,7 @@ import json import subprocess import time import urllib.parse +from contextlib import closing from typing import TYPE_CHECKING import psycopg2 @@ -131,6 +132,24 @@ def test_proxy_options(static_proxy: NeonProxy, option_name: str): assert out[0][0] == " str" +@pytest.mark.asyncio +async def test_proxy_arbitrary_params(static_proxy: NeonProxy): + with closing( + await static_proxy.connect_async(server_settings={"IntervalStyle": "iso_8601"}) + ) as conn: + out = await conn.fetchval("select to_json('0 seconds'::interval)") + assert out == '"00:00:00"' + + options = "neon_proxy_params_compat:true" + with closing( + await static_proxy.connect_async( + server_settings={"IntervalStyle": "iso_8601", "options": options} + ) + ) as conn: + out = await conn.fetchval("select to_json('0 seconds'::interval)") + assert out == '"PT0S"' + + def test_auth_errors(static_proxy: NeonProxy): """ Check that we throw very specific errors in some unsuccessful auth scenarios. From bbb050459bc0b9ee99fee937498c77fa6e70edfc Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 4 Dec 2024 14:05:31 +0100 Subject: [PATCH 187/209] feat(compute): Set default application_name for pgbouncer connections (#9973) ## Problem When client specifies `application_name`, pgbouncer propagates it to the Postgres. Yet, if client doesn't do it, we have hard time figuring out who opens a lot of Postgres connections (including the `cloud_admin` ones). See this investigation as an example: https://neondb.slack.com/archives/C0836R0RZ0D ## Summary of changes I haven't found this documented, but it looks like pgbouncer accepts standard Postgres connstring parameters in the connstring in the `[databases]` section, so put the default `application_name=pgbouncer` there. That way, we will always see who opens Postgres connections. I did tests, and if client specifies a `application_name`, pgbouncer overrides this default, so it only works if it's not specified or set to blank `&application_name=` in the connection string. This is the last place we could potentially open some Postgres connections without `application_name`. Everything else should be either of two: 1. Direct client connections without `application_name`, but these should be strictly non-`cloud_admin` ones 2. Some ad-hoc internal connections, so if we see spikes of unidentified `cloud_admin` connections, we will need to investigate it again. Fixes neondatabase/cloud#20948 --- compute/etc/pgbouncer.ini | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini index cb994f961c..abcd165636 100644 --- a/compute/etc/pgbouncer.ini +++ b/compute/etc/pgbouncer.ini @@ -1,5 +1,9 @@ [databases] -*=host=localhost port=5432 auth_user=cloud_admin +;; pgbouncer propagates application_name (if it's specified) to the server, but some +;; clients don't set it. We set default application_name=pgbouncer to make it +;; easier to identify pgbouncer connections in Postgres. 
If client sets +;; application_name, it will be used instead. +*=host=localhost port=5432 auth_user=cloud_admin application_name=pgbouncer [pgbouncer] listen_port=6432 listen_addr=0.0.0.0 From 8c6b41daf5333b09664542e235e64fa51e232fbe Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 4 Dec 2024 14:05:53 +0100 Subject: [PATCH 188/209] Display reqwest error source (#10004) ## Problem Reqwest errors don't include details about the inner source error. This means that we get opaque errors like: ``` receive body: error sending request for url (http://localhost:9898/v1/location_config) ``` Instead of the more helpful: ``` receive body: error sending request for url (http://localhost:9898/v1/location_config): operation timed out ``` Touches #9801. ## Summary of changes Include the source error for `reqwest::Error` wherever it's displayed. --- control_plane/src/safekeeper.rs | 3 ++- pageserver/client/src/mgmt_api.rs | 6 +++--- pageserver/src/consumption_metrics/upload.rs | 7 ++++++- safekeeper/src/http/client.rs | 3 ++- storage_controller/src/compute_hook.rs | 3 ++- storage_controller/src/peer_client.rs | 11 ++++++++--- storage_scrubber/src/cloud_admin_api.rs | 14 ++++++++++---- 7 files changed, 33 insertions(+), 14 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 7a019bce88..f0c3722925 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -5,6 +5,7 @@ //! ```text //! .neon/safekeepers/ //! ``` +use std::error::Error as _; use std::future::Future; use std::io::Write; use std::path::PathBuf; @@ -26,7 +27,7 @@ use crate::{ #[derive(Error, Debug)] pub enum SafekeeperHttpError { - #[error("Reqwest error: {0}")] + #[error("request error: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] Transport(#[from] reqwest::Error), #[error("Error: {0}")] diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4d76c66905..c3a1ef8140 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::{collections::HashMap, error::Error as _}; use bytes::Bytes; use detach_ancestor::AncestorDetached; @@ -25,10 +25,10 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { - #[error("send request: {0}")] + #[error("send request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] SendRequest(reqwest::Error), - #[error("receive body: {0}")] + #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] ReceiveBody(reqwest::Error), #[error("receive error body: {0}")] diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 1cb4e917c0..448bf47525 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -1,3 +1,4 @@ +use std::error::Error as _; use std::time::SystemTime; use chrono::{DateTime, Utc}; @@ -350,7 +351,11 @@ impl std::fmt::Display for UploadError { match self { Rejected(code) => write!(f, "server rejected the metrics with {code}"), - Reqwest(e) => write!(f, "request failed: {e}"), + Reqwest(e) => write!( + f, + "request failed: {e}{}", + e.source().map(|e| format!(": {e}")).unwrap_or_default() + ), Cancelled => write!(f, "cancelled"), } } diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs index c56f7880d4..a166fc1ab9 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/src/http/client.rs @@ -8,6 +8,7 
@@ //! etc. use reqwest::{IntoUrl, Method, StatusCode}; +use std::error::Error as _; use utils::{ http::error::HttpErrorBody, id::{NodeId, TenantId, TimelineId}, @@ -26,7 +27,7 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { /// Failed to receive body (reqwest error). - #[error("receive body: {0}")] + #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] ReceiveBody(reqwest::Error), /// Status is not ok, but failed to parse body as `HttpErrorBody`. diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index b63a322b87..2b2ece3f02 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,3 +1,4 @@ +use std::error::Error as _; use std::sync::Arc; use std::{collections::HashMap, time::Duration}; @@ -172,7 +173,7 @@ struct ComputeHookNotifyRequest { #[derive(thiserror::Error, Debug)] pub(crate) enum NotifyError { // Request was not send successfully, e.g. transport error - #[error("Sending request: {0}")] + #[error("Sending request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] Request(#[from] reqwest::Error), // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon. #[error("Control plane tenant busy")] diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index 3f8520fe55..ee4eb55294 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -1,7 +1,9 @@ use crate::tenant_shard::ObservedState; use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; -use std::{collections::HashMap, time::Duration}; +use std::collections::HashMap; +use std::error::Error as _; +use std::time::Duration; use tokio_util::sync::CancellationToken; use hyper::Uri; @@ -17,11 +19,14 @@ pub(crate) struct PeerClient { #[derive(thiserror::Error, Debug)] pub(crate) enum StorageControllerPeerError { - #[error("failed to deserialize error response with status code {0} at {1}: {2}")] + #[error( + "failed to deserialize error response with status code {0} at {1}: {2}{}", + .2.source().map(|e| format!(": {e}")).unwrap_or_default() + )] DeserializationError(StatusCode, Url, reqwest::Error), #[error("storage controller peer API error ({0}): {1}")] ApiError(StatusCode, String), - #[error("failed to send HTTP request: {0}")] + #[error("failed to send HTTP request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] SendError(reqwest::Error), #[error("Cancelled")] Cancelled, diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index c9a62cd256..b1dfe3a53f 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -1,3 +1,5 @@ +use std::error::Error as _; + use chrono::{DateTime, Utc}; use futures::Future; use hex::FromHex; @@ -30,14 +32,18 @@ impl std::fmt::Display for Error { match &self.kind { ErrorKind::RequestSend(e) => write!( f, - "Failed to send a request. Context: {}, error: {}", - self.context, e + "Failed to send a request. Context: {}, error: {}{}", + self.context, + e, + e.source().map(|e| format!(": {e}")).unwrap_or_default() ), ErrorKind::BodyRead(e) => { write!( f, - "Failed to read a request body. Context: {}, error: {}", - self.context, e + "Failed to read a request body. 
Context: {}, error: {}{}", + self.context, + e, + e.source().map(|e| format!(": {e}")).unwrap_or_default() ) } ErrorKind::ResponseStatus(status) => { From d550e3f6263d988166c84ac62358abdd7e01f948 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:10:00 +0100 Subject: [PATCH 189/209] Create a branch for compute release (#9637) ## Problem We practice a manual release flow for the compute module. This will allow automation of the compute release process. ## Summary of changes The workflow was modified to make a compute release automatically on the branch release-compute. ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .../actions/allure-report-generate/action.yml | 3 ++- .../actions/allure-report-store/action.yml | 3 ++- .github/workflows/_create-release-pr.yml | 2 +- .github/workflows/build_and_test.yml | 23 +++++++++++-------- .github/workflows/release.yml | 23 ++++++++++++++++--- .github/workflows/trigger-e2e-tests.yml | 2 ++ 6 files changed, 41 insertions(+), 15 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index d1d09223db..d6219c31b4 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -43,7 +43,8 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ + [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 9c376f420a..3c83656c89 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -23,7 +23,8 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ + [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index cc6994397f..3c130c8229 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -21,7 +21,7 @@ defaults: shell: bash -euo pipefail {0} jobs: - create-storage-release-branch: + create-release-branch: runs-on: ubuntu-22.04 permissions: diff --git a/.github/workflows/build_and_test.yml 
b/.github/workflows/build_and_test.yml index e9e111e7bd..cb966f292e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -6,6 +6,7 @@ on: - main - release - release-proxy + - release-compute pull_request: defaults: @@ -70,8 +71,10 @@ jobs: echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT fi shell: bash @@ -513,7 +516,7 @@ jobs: }) trigger-e2e-tests: - if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }} + if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} needs: [ check-permissions, promote-images, tag ] uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit @@ -934,7 +937,7 @@ jobs: neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - name: Configure AWS-prod credentials - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -943,12 +946,12 @@ jobs: - name: Login to prod ECR uses: docker/login-action@v3 - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' with: registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com - name: Copy all images to prod ECR - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' run: | for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ @@ -968,7 +971,7 @@ jobs: tenant_id: ${{ vars.AZURE_TENANT_ID }} push-to-acr-prod: - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' needs: [ tag, promote-images ] uses: ./.github/workflows/_push-to-acr.yml with: @@ -1056,7 +1059,7 @@ jobs: deploy: needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` - if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy') && !failure() && !cancelled() + if: (github.ref_name == 
'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest @@ -1105,13 +1108,15 @@ jobs: -f deployProxyAuthBroker=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}} else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'" exit 1 fi - name: Create git tag - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 11f010b6d4..f0273b977f 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,6 +15,10 @@ on: type: boolean description: 'Create Proxy release PR' required: false + create-compute-release-branch: + type: boolean + description: 'Create Compute release PR' + required: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} @@ -25,20 +29,20 @@ defaults: jobs: create-storage-release-branch: - if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} + if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }} permissions: contents: write uses: ./.github/workflows/_create-release-pr.yml with: - component-name: 'Storage & Compute' + component-name: 'Storage' release-branch: 'release' secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} create-proxy-release-branch: - if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} + if: ${{ github.event.schedule == '0 6 * * THU' || inputs.create-proxy-release-branch }} permissions: contents: write @@ -49,3 +53,16 @@ jobs: release-branch: 'release-proxy' secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + + create-compute-release-branch: + if: inputs.create-compute-release-branch + + permissions: + contents: write + + uses: ./.github/workflows/_create-release-pr.yml + with: + component-name: 'Compute' + release-branch: 'release-compute' + secrets: + ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 1e7264c55a..70c2e8549f 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -51,6 +51,8 @@ jobs: echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" BUILD_AND_TEST_RUN_ID=$(gh run list -b 
$CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') From 088eb72dd7a3d104cf854bf87be30ba922b52c90 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 4 Dec 2024 15:04:04 +0000 Subject: [PATCH 190/209] tests: make storcon scale test AZ-aware (#9952) ## Problem We have a scale test for the storage controller which also acts as a good stress test for scheduling stability. However, it created nodes with no AZs set. ## Summary of changes - Bump node count to 6 and set AZs on them. This is a precursor to other AZ-related PRs, to make sure any new code that's landed is getting scale tested in an AZ-aware environment. --- test_runner/performance/test_storage_controller_scale.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 142bd3d669..49f41483ec 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -72,7 +72,7 @@ def test_storage_controller_many_tenants( we don't fall over for a thousand shards. """ - neon_env_builder.num_pageservers = 5 + neon_env_builder.num_pageservers = 6 neon_env_builder.storage_controller_config = { # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to @@ -84,6 +84,11 @@ def test_storage_controller_many_tenants( compute_reconfigure_listener.control_plane_compute_hook_api ) + AZS = ["alpha", "bravo", "charlie"] + neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update( + {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"} + ) + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) From 7b0e3db868ff251f5b3f043c93ea8d59fb8e929e Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 4 Dec 2024 11:54:56 -0500 Subject: [PATCH 191/209] pageserver: make `BufferedWriter` do double-buffering (#9693) Closes #9387. ## Problem `BufferedWriter` cannot proceed while the owned buffer is flushing to disk. We want to implement double buffering so that the flush can happen in the background. See #9387. ## Summary of changes - Maintain two owned buffers in `BufferedWriter`. - The writer is in charge of copying the data into owned, aligned buffer, once full, submit it to the flush task. - The flush background task is in charge of flushing the owned buffer to disk, and returned the buffer to the writer for reuse. - The writer and the flush background task communicate through a bi-directional channel. For in-memory layer, we also need to be able to read from the buffered writer in `get_values_reconstruct_data`. To handle this case, we did the following - Use replace `VirtualFile::write_all` with `VirtualFile::write_all_at`, and use `Arc` to share it between writer and background task. - leverage `IoBufferMut::freeze` to get a cheaply clonable `IoBuffer`, one clone will be submitted to the channel, the other clone will be saved within the writer to serve reads. When we want to reuse the buffer, we can invoke `IoBuffer::into_mut`, which gives us back the mutable aligned buffer. - InMemoryLayer reads is now aware of the maybe_flushed part of the buffer. 
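To make the writer/flush-task handoff described in the bullets above easier to picture, here is a minimal, self-contained sketch. It is illustrative only and not the code in this patch: the real implementation exchanges `IoBufferMut`/`IoBuffer` over a `Duplex` channel and writes via `VirtualFile::write_all_at`, while this sketch uses plain `Vec<u8>`, two separate tokio mpsc channels, and `println!` as a stand-in for the disk write.

```rust
// Sketch of double-buffering: a writer fills one owned buffer while a background
// task flushes the previously submitted one; flushed buffers are recycled back.
// Assumes tokio with the "rt" + "macros" features (e.g. features = ["full"]).
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    // writer -> flusher carries full buffers; flusher -> writer returns recycled ones.
    let (full_tx, mut full_rx) = mpsc::channel::<Vec<u8>>(1);
    let (free_tx, mut free_rx) = mpsc::channel::<Vec<u8>>(1);

    // Background flush task: stands in for "write this buffer to disk".
    let flusher = tokio::spawn(async move {
        while let Some(mut buf) = full_rx.recv().await {
            println!("flushing {} bytes", buf.len()); // pretend disk write
            buf.clear();
            if free_tx.send(buf).await.is_err() {
                break; // writer side is gone
            }
        }
    });

    // Writer side: keep filling `mutable` while the previous buffer flushes.
    let mut mutable: Vec<u8> = Vec::with_capacity(4);
    for chunk in [&b"foo"[..], b"bar", b"baz"] {
        if mutable.len() + chunk.len() > mutable.capacity() {
            // Submit the full buffer and take back a recycled one for reuse.
            full_tx.send(std::mem::take(&mut mutable)).await.unwrap();
            mutable = free_rx.recv().await.unwrap();
        }
        mutable.extend_from_slice(chunk);
    }
    full_tx.send(mutable).await.unwrap(); // flush the tail
    drop(full_tx); // lets the flush task exit
    flusher.await.unwrap();
}
```

The point of keeping two buffers in flight is that the writer only blocks when it fills a new buffer before the previous flush has completed, instead of blocking on every flush as the old single-buffer writer did.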
**Caveat** - We removed the owned version of write, because this interface does not work well with buffer alignment. The result is that without direct IO enabled, [`download_object`](https://github.com/neondatabase/neon/blob/a439d57050dafd603d24e001215213eb5246a029/pageserver/src/tenant/remote_timeline_client/download.rs#L243) does one more memcpy than before this PR due to the switch to use `_borrowed` version of the write. - "Bypass aligned part of write" could be implemented later to avoid large amount of memcpy. **Testing** - use an oneshot channel based control mechanism to make flush behavior deterministic in test. - test reading from `EphemeralFile` when the last submitted buffer is not flushed, in-progress, and done flushing to disk. ## Performance We see performance improvement for small values, and regression on big values, likely due to being CPU bound + disk write latency. [Results](https://www.notion.so/neondatabase/Benchmarking-New-BufferedWriter-11-20-2024-143f189e0047805ba99acda89f984d51?pvs=4) ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Signed-off-by: Yuchen Liang Co-authored-by: Christian Schwarz --- libs/utils/src/sync.rs | 1 + libs/utils/src/sync/duplex.rs | 1 + libs/utils/src/sync/duplex/mpsc.rs | 36 ++ pageserver/benches/bench_ingest.rs | 4 +- pageserver/src/tenant/ephemeral_file.rs | 286 +++++++++----- .../src/tenant/remote_timeline_client.rs | 2 + .../tenant/remote_timeline_client/download.rs | 44 ++- pageserver/src/tenant/secondary/downloader.rs | 1 + .../tenant/storage_layer/inmemory_layer.rs | 5 +- pageserver/src/tenant/storage_layer/layer.rs | 1 + pageserver/src/tenant/timeline.rs | 3 +- .../src/tenant/timeline/layer_manager.rs | 14 +- pageserver/src/virtual_file.rs | 26 +- .../aligned_buffer/alignment.rs | 4 +- .../owned_buffers_io/aligned_buffer/buffer.rs | 18 +- .../aligned_buffer/buffer_mut.rs | 43 ++- .../owned_buffers_io/io_buf_aligned.rs | 10 +- .../owned_buffers_io/io_buf_ext.rs | 14 + .../util/size_tracking_writer.rs | 50 --- .../virtual_file/owned_buffers_io/write.rs | 358 +++++++++--------- .../owned_buffers_io/write/flush.rs | 314 +++++++++++++++ 21 files changed, 846 insertions(+), 389 deletions(-) create mode 100644 libs/utils/src/sync/duplex.rs create mode 100644 libs/utils/src/sync/duplex/mpsc.rs delete mode 100644 pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/write/flush.rs diff --git a/libs/utils/src/sync.rs b/libs/utils/src/sync.rs index 7aa26e24bc..280637de8f 100644 --- a/libs/utils/src/sync.rs +++ b/libs/utils/src/sync.rs @@ -1,5 +1,6 @@ pub mod heavier_once_cell; +pub mod duplex; pub mod gate; pub mod spsc_fold; diff --git a/libs/utils/src/sync/duplex.rs b/libs/utils/src/sync/duplex.rs new file mode 100644 index 0000000000..fac79297a0 --- /dev/null +++ b/libs/utils/src/sync/duplex.rs @@ -0,0 +1 @@ +pub mod mpsc; diff --git a/libs/utils/src/sync/duplex/mpsc.rs b/libs/utils/src/sync/duplex/mpsc.rs new file mode 100644 index 0000000000..56b4e6d2b3 --- /dev/null +++ b/libs/utils/src/sync/duplex/mpsc.rs @@ -0,0 +1,36 
@@ +use tokio::sync::mpsc; + +/// A bi-directional channel. +pub struct Duplex { + pub tx: mpsc::Sender, + pub rx: mpsc::Receiver, +} + +/// Creates a bi-directional channel. +/// +/// The channel will buffer up to the provided number of messages. Once the buffer is full, +/// attempts to send new messages will wait until a message is received from the channel. +/// The provided buffer capacity must be at least 1. +pub fn channel(buffer: usize) -> (Duplex, Duplex) { + let (tx_a, rx_a) = mpsc::channel::(buffer); + let (tx_b, rx_b) = mpsc::channel::(buffer); + + (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a }) +} + +impl Duplex { + /// Sends a value, waiting until there is capacity. + /// + /// A successful send occurs when it is determined that the other end of the channel has not hung up already. + pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError> { + self.tx.send(x).await + } + + /// Receives the next value for this receiver. + /// + /// This method returns `None` if the channel has been closed and there are + /// no remaining messages in the channel's buffer. + pub async fn recv(&mut self) -> Option { + self.rx.recv().await + } +} diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index caacd365b3..b67a9cc479 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -62,10 +62,8 @@ async fn ingest( let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); let gate = utils::sync::gate::Gate::default(); - let entered = gate.enter().unwrap(); - let layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; + let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &gate, &ctx).await?; let data = Value::Image(Bytes::from(vec![0u8; put_size])); let data_ser_size = data.serialized_size().unwrap() as usize; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index de0abab4c0..aaec8a4c31 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -8,10 +8,8 @@ use crate::page_cache; use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; -use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use crate::virtual_file::owned_buffers_io::write::Buffer; use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; -use bytes::BytesMut; use camino::Utf8PathBuf; use num_traits::Num; use pageserver_api::shard::TenantShardId; @@ -20,6 +18,7 @@ use tracing::error; use std::io; use std::sync::atomic::AtomicU64; +use std::sync::Arc; use utils::id::TimelineId; pub struct EphemeralFile { @@ -27,10 +26,7 @@ pub struct EphemeralFile { _timeline_id: TimelineId, page_cache_file_id: page_cache::FileId, bytes_written: u64, - buffered_writer: owned_buffers_io::write::BufferedWriter< - BytesMut, - size_tracking_writer::Writer, - >, + buffered_writer: owned_buffers_io::write::BufferedWriter, /// Gate guard is held on as long as we need to do operations in the path (delete on drop) _gate_guard: utils::sync::gate::GateGuard, } @@ -42,9 +38,9 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, - ) -> Result { + ) -> anyhow::Result 
{ static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed); @@ -55,15 +51,17 @@ impl EphemeralFile { "ephemeral-{filename_disambiguator}" ))); - let file = VirtualFile::open_with_options( - &filename, - virtual_file::OpenOptions::new() - .read(true) - .write(true) - .create(true), - ctx, - ) - .await?; + let file = Arc::new( + VirtualFile::open_with_options_v2( + &filename, + virtual_file::OpenOptions::new() + .read(true) + .write(true) + .create(true), + ctx, + ) + .await?, + ); let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore @@ -73,10 +71,12 @@ impl EphemeralFile { page_cache_file_id, bytes_written: 0, buffered_writer: owned_buffers_io::write::BufferedWriter::new( - size_tracking_writer::Writer::new(file), - BytesMut::with_capacity(TAIL_SZ), + file, + || IoBufferMut::with_capacity(TAIL_SZ), + gate.enter()?, + ctx, ), - _gate_guard: gate_guard, + _gate_guard: gate.enter()?, }) } } @@ -85,7 +85,7 @@ impl Drop for EphemeralFile { fn drop(&mut self) { // unlink the file // we are clear to do this, because we have entered a gate - let path = self.buffered_writer.as_inner().as_inner().path(); + let path = self.buffered_writer.as_inner().path(); let res = std::fs::remove_file(path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { @@ -132,6 +132,18 @@ impl EphemeralFile { srcbuf: &[u8], ctx: &RequestContext, ) -> std::io::Result { + let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?; + if let Some(control) = control { + control.release().await; + } + Ok(pos) + } + + async fn write_raw_controlled( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result<(u64, Option)> { let pos = self.bytes_written; let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| { @@ -145,9 +157,9 @@ impl EphemeralFile { })?; // Write the payload - let nwritten = self + let (nwritten, control) = self .buffered_writer - .write_buffered_borrowed(srcbuf, ctx) + .write_buffered_borrowed_controlled(srcbuf, ctx) .await?; assert_eq!( nwritten, @@ -157,7 +169,7 @@ impl EphemeralFile { self.bytes_written = new_bytes_written; - Ok(pos) + Ok((pos, control)) } } @@ -168,11 +180,12 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst: tokio_epoll_uring::Slice, ctx: &'a RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { - let file_size_tracking_writer = self.buffered_writer.as_inner(); - let flushed_offset = file_size_tracking_writer.bytes_written(); + let submitted_offset = self.buffered_writer.bytes_submitted(); - let buffer = self.buffered_writer.inspect_buffer(); - let buffered = &buffer[0..buffer.pending()]; + let mutable = self.buffered_writer.inspect_mutable(); + let mutable = &mutable[0..mutable.pending()]; + + let maybe_flushed = self.buffered_writer.inspect_maybe_flushed(); let dst_cap = dst.bytes_total().into_u64(); let end = { @@ -197,11 +210,42 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral } } } - let written_range = Range(start, std::cmp::min(end, flushed_offset)); - let buffered_range = Range(std::cmp::max(start, flushed_offset), end); + + let (written_range, maybe_flushed_range) = { + if maybe_flushed.is_some() { + // [ written ][ maybe_flushed ][ mutable ] + // <- TAIL_SZ -><- TAIL_SZ -> + // ^ + // `submitted_offset` + // <++++++ on disk +++++++????????????????> + ( + Range( + start, + std::cmp::min(end, 
submitted_offset.saturating_sub(TAIL_SZ as u64)), + ), + Range( + std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)), + std::cmp::min(end, submitted_offset), + ), + ) + } else { + // [ written ][ mutable ] + // <- TAIL_SZ -> + // ^ + // `submitted_offset` + // <++++++ on disk +++++++++++++++++++++++> + ( + Range(start, std::cmp::min(end, submitted_offset)), + // zero len + Range(submitted_offset, u64::MIN), + ) + } + }; + + let mutable_range = Range(std::cmp::max(start, submitted_offset), end); let dst = if written_range.len() > 0 { - let file: &VirtualFile = file_size_tracking_writer.as_inner(); + let file: &VirtualFile = self.buffered_writer.as_inner(); let bounds = dst.bounds(); let slice = file .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) @@ -211,19 +255,21 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; - let dst = if buffered_range.len() > 0 { - let offset_in_buffer = buffered_range + let dst = if maybe_flushed_range.len() > 0 { + let offset_in_buffer = maybe_flushed_range .0 - .checked_sub(flushed_offset) + .checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64)) .unwrap() .into_usize(); - let to_copy = - &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())]; + // Checked previously the buffer is Some. + let maybe_flushed = maybe_flushed.unwrap(); + let to_copy = &maybe_flushed + [offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())]; let bounds = dst.bounds(); let mut view = dst.slice({ let start = written_range.len().into_usize(); let end = start - .checked_add(buffered_range.len().into_usize()) + .checked_add(maybe_flushed_range.len().into_usize()) .unwrap(); start..end }); @@ -234,6 +280,28 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; + let dst = if mutable_range.len() > 0 { + let offset_in_buffer = mutable_range + .0 + .checked_sub(submitted_offset) + .unwrap() + .into_usize(); + let to_copy = + &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())]; + let bounds = dst.bounds(); + let mut view = dst.slice({ + let start = + written_range.len().into_usize() + maybe_flushed_range.len().into_usize(); + let end = start.checked_add(mutable_range.len().into_usize()).unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) + } else { + dst + }; + // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs Ok((dst, (end - start).into_usize())) @@ -295,7 +363,7 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) .await .unwrap(); @@ -326,14 +394,15 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + let mutable = file.buffered_writer.inspect_mutable(); + let cap = mutable.capacity(); + let align = mutable.align(); - let write_nbytes = cap + cap / 2; + let write_nbytes = cap * 2 + cap / 2; let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) @@ -341,30 
+410,39 @@ mod tests { .collect(); let mut value_offsets = Vec::new(); - for i in 0..write_nbytes { - let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap(); + for range in (0..write_nbytes) + .step_by(align) + .map(|start| start..(start + align).min(write_nbytes)) + { + let off = file.write_raw(&content[range], &ctx).await.unwrap(); value_offsets.push(off); } - assert!(file.len() as usize == write_nbytes); - for i in 0..write_nbytes { - assert_eq!(value_offsets[i], i.into_u64()); - let buf = IoBufferMut::with_capacity(1); + assert_eq!(file.len() as usize, write_nbytes); + for (i, range) in (0..write_nbytes) + .step_by(align) + .map(|start| start..(start + align).min(write_nbytes)) + .enumerate() + { + assert_eq!(value_offsets[i], range.start.into_u64()); + let buf = IoBufferMut::with_capacity(range.len()); let (buf_slice, nread) = file - .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx) + .read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx) .await .unwrap(); let buf = buf_slice.into_inner(); - assert_eq!(nread, 1); - assert_eq!(&buf, &content[i..i + 1]); + assert_eq!(nread, range.len()); + assert_eq!(&buf, &content[range]); } - let file_contents = - std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap(); - assert_eq!(file_contents, &content[0..cap]); + let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap(); + assert!(file_contents == content[0..cap * 2]); - let buffer_contents = file.buffered_writer.inspect_buffer(); - assert_eq!(buffer_contents, &content[cap..write_nbytes]); + let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap(); + assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]); + + let mutable_buffer_contents = file.buffered_writer.inspect_mutable(); + assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]); } #[tokio::test] @@ -373,16 +451,16 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + // mutable buffer and maybe_flushed buffer each has `cap` bytes. 
+ let cap = file.buffered_writer.inspect_mutable().capacity(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) - .take(cap + cap / 2) + .take(cap * 2 + cap / 2) .collect(); file.write_raw(&content, &ctx).await.unwrap(); @@ -390,23 +468,21 @@ mod tests { // assert the state is as this test expects it to be assert_eq!( &file.load_to_io_buf(&ctx).await.unwrap(), - &content[0..cap + cap / 2] + &content[0..cap * 2 + cap / 2] ); - let md = file - .buffered_writer - .as_inner() - .as_inner() - .path() - .metadata() - .unwrap(); + let md = file.buffered_writer.as_inner().path().metadata().unwrap(); assert_eq!( md.len(), - cap.into_u64(), - "buffered writer does one write if we write 1.5x buffer capacity" + 2 * cap.into_u64(), + "buffered writer requires one write to be flushed if we write 2.5x buffer capacity" ); assert_eq!( - &file.buffered_writer.inspect_buffer()[0..cap / 2], - &content[cap..cap + cap / 2] + &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap], + &content[cap..cap * 2] + ); + assert_eq!( + &file.buffered_writer.inspect_mutable()[0..cap / 2], + &content[cap * 2..cap * 2 + cap / 2] ); } @@ -422,19 +498,19 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); - - let cap = file.buffered_writer.inspect_buffer().capacity(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); + let mutable = file.buffered_writer.inspect_mutable(); + let cap = mutable.capacity(); + let align = mutable.align(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) - .take(cap + cap / 2) + .take(cap * 2 + cap / 2) .collect(); - file.write_raw(&content, &ctx).await.unwrap(); + let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap(); let test_read = |start: usize, len: usize| { let file = &file; @@ -454,16 +530,38 @@ mod tests { } }; + let test_read_all_offset_combinations = || { + async move { + test_read(align, align).await; + // border onto edge of file + test_read(cap - align, align).await; + // read across file and buffer + test_read(cap - align, 2 * align).await; + // stay from start of maybe flushed buffer + test_read(cap, align).await; + // completely within maybe flushed buffer + test_read(cap + align, align).await; + // border onto edge of maybe flushed buffer. + test_read(cap * 2 - align, align).await; + // read across maybe flushed and mutable buffer + test_read(cap * 2 - align, 2 * align).await; + // read across three segments + test_read(cap - align, cap + 2 * align).await; + // completely within mutable buffer + test_read(cap * 2 + align, align).await; + } + }; + // completely within the file range - assert!(20 < cap, "test assumption"); - test_read(10, 10).await; - // border onto edge of file - test_read(cap - 10, 10).await; - // read across file and buffer - test_read(cap - 10, 20).await; - // stay from start of buffer - test_read(cap, 10).await; - // completely within buffer - test_read(cap + 10, 10).await; + assert!(align < cap, "test assumption"); + assert!(cap % align == 0); + + // test reads at different flush stages. 
+ let not_started = control.unwrap().into_not_started(); + test_read_all_offset_combinations().await; + let in_progress = not_started.ready_to_flush(); + test_read_all_offset_combinations().await; + in_progress.wait_until_flush_is_done().await; + test_read_all_offset_combinations().await; } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 4bb1bbf3cf..89b935947d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -681,6 +681,7 @@ impl RemoteTimelineClient { layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, local_path: &Utf8Path, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -700,6 +701,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, local_path, + gate, cancel, ctx, ) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 739615be9c..c5ae466f3a 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,6 +6,7 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; +use std::sync::Arc; use std::time::SystemTime; use anyhow::{anyhow, Context}; @@ -26,9 +27,7 @@ use crate::span::{ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -#[cfg_attr(target_os = "macos", allow(unused_imports))] -use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{on_fatal_io_error, IoBufferMut, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{ DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, @@ -60,6 +59,7 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, local_path: &Utf8Path, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -88,7 +88,9 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, + || async { + download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await + }, &format!("download {remote_path:?}"), cancel, ) @@ -148,6 +150,7 @@ async fn download_object<'a>( storage: &'a GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { @@ -205,13 +208,16 @@ async fn download_object<'a>( } #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { - use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; - use bytes::BytesMut; + use crate::virtual_file::owned_buffers_io; async { - let destination_file = VirtualFile::create(dst_path, ctx) - .await - .with_context(|| format!("create a destination file for layer '{dst_path}'")) - .map_err(DownloadError::Other)?; + let destination_file = Arc::new( + VirtualFile::create(dst_path, ctx) + .await + .with_context(|| { + format!("create a 
destination file for layer '{dst_path}'") + }) + .map_err(DownloadError::Other)?, + ); let mut download = storage .download(src_path, &DownloadOpts::default(), cancel) @@ -219,14 +225,16 @@ async fn download_object<'a>( pausable_failpoint!("before-downloading-layer-stream-pausable"); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + destination_file, + || IoBufferMut::with_capacity(super::BUFFER_SIZE), + gate.enter().map_err(|_| DownloadError::Cancelled)?, + ctx, + ); + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. // There's chunks_vectored() on the stream. let (bytes_amount, destination_file) = async { - let size_tracking = size_tracking_writer::Writer::new(destination_file); - let mut buffered = owned_buffers_io::write::BufferedWriter::::new( - size_tracking, - BytesMut::with_capacity(super::BUFFER_SIZE), - ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { @@ -234,10 +242,10 @@ async fn download_object<'a>( Ok(chunk) => chunk, Err(e) => return Err(e), }; - buffered.write_buffered(chunk.slice_len(), ctx).await?; + buffered.write_buffered_borrowed(&chunk, ctx).await?; } - let size_tracking = buffered.flush_and_into_inner(ctx).await?; - Ok(size_tracking.into_inner()) + let inner = buffered.flush_and_into_inner(ctx).await?; + Ok(inner) } .await?; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 701e4cf04b..395e34e404 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1183,6 +1183,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &layer.metadata, &local_path, + &self.secondary_state.gate, &self.secondary_state.cancel, ctx, ) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index af6112d535..71e53da20f 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -555,13 +555,12 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = - EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index a9f1189b41..8933e8ceb1 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1149,6 +1149,7 @@ impl LayerInner { &self.desc.layer_name(), &self.metadata(), &self.path, + &timeline.gate, &timeline.cancel, ctx, ) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1414bef0a5..fc741826ab 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3455,7 +3455,6 @@ impl Timeline { ctx: &RequestContext, ) -> anyhow::Result> { let mut guard = self.layers.write().await; - let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; let last_record_lsn = self.get_last_record_lsn(); ensure!( @@ -3472,7 +3471,7 @@ impl Timeline { self.conf, self.timeline_id, 
self.tenant_shard_id, - gate_guard, + &self.gate, ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 4293a44dca..3888e7f86a 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -182,7 +182,7 @@ impl OpenLayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> anyhow::Result> { ensure!(lsn.is_aligned()); @@ -212,15 +212,9 @@ impl OpenLayerManager { lsn ); - let new_layer = InMemoryLayer::create( - conf, - timeline_id, - tenant_shard_id, - start_lsn, - gate_guard, - ctx, - ) - .await?; + let new_layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, gate, ctx) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b9f8c7ea20..8a7f4a4bf5 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -20,7 +20,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign}; -use owned_buffers_io::io_buf_aligned::IoBufAlignedMut; +use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; @@ -63,9 +63,6 @@ pub(crate) mod owned_buffers_io { pub(crate) mod io_buf_ext; pub(crate) mod slice; pub(crate) mod write; - pub(crate) mod util { - pub(crate) mod size_tracking_writer; - } } #[derive(Debug)] @@ -221,7 +218,7 @@ impl VirtualFile { self.inner.read_exact_at_page(page, offset, ctx).await } - pub async fn write_all_at( + pub async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1325,14 +1322,14 @@ impl Drop for VirtualFileInner { } impl OwnedAsyncWriter for VirtualFile { - #[inline(always)] - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; - res.map(move |v| (v, buf)) + ) -> std::io::Result> { + let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await; + res.map(|_| buf) } } @@ -1451,7 +1448,7 @@ mod tests { } } } - async fn write_all_at( + async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1594,6 +1591,7 @@ mod tests { &ctx, ) .await?; + file_a .write_all(b"foobar".to_vec().slice_len(), &ctx) .await?; @@ -1652,10 +1650,10 @@ mod tests { ) .await?; file_b - .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx) + .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx) .await?; file_b - .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx) + .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx) .await?; assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs index 933b78a13b..6b9992643f 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs 
@@ -4,7 +4,7 @@ pub trait Alignment: std::marker::Unpin + 'static { } /// Alignment at compile time. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct ConstAlign; impl Alignment for ConstAlign { @@ -14,7 +14,7 @@ impl Alignment for ConstAlign { } /// Alignment at run time. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct RuntimeAlign { align: usize, } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index 2fba6d699b..a5c26cd746 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -3,9 +3,10 @@ use std::{ sync::Arc, }; -use super::{alignment::Alignment, raw::RawAlignedBuffer}; +use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign}; /// An shared, immutable aligned buffer type. +#[derive(Clone, Debug)] pub struct AlignedBuffer { /// Shared raw buffer. raw: Arc>, @@ -86,6 +87,13 @@ impl AlignedBuffer { range: begin..end, } } + + /// Returns the mutable aligned buffer, if the immutable aligned buffer + /// has exactly one strong reference. Otherwise returns `None`. + pub fn into_mut(self) -> Option> { + let raw = Arc::into_inner(self.raw)?; + Some(AlignedBufferMut::from_raw(raw)) + } } impl Deref for AlignedBuffer { @@ -108,6 +116,14 @@ impl PartialEq<[u8]> for AlignedBuffer { } } +impl From<&[u8; N]> for AlignedBuffer> { + fn from(value: &[u8; N]) -> Self { + let mut buf = AlignedBufferMut::with_capacity(N); + buf.extend_from_slice(value); + buf.freeze() + } +} + /// SAFETY: the underlying buffer references a stable memory region. unsafe impl tokio_epoll_uring::IoBuf for AlignedBuffer { fn stable_ptr(&self) -> *const u8 { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index b3675d1aea..d2f5e206bb 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,4 +1,7 @@ -use std::ops::{Deref, DerefMut}; +use std::{ + mem::MaybeUninit, + ops::{Deref, DerefMut}, +}; use super::{ alignment::{Alignment, ConstAlign}, @@ -46,6 +49,11 @@ impl AlignedBufferMut> { } impl AlignedBufferMut { + /// Constructs a mutable aligned buffer from raw. + pub(super) fn from_raw(raw: RawAlignedBuffer) -> Self { + AlignedBufferMut { raw } + } + /// Returns the total number of bytes the buffer can hold. #[inline] pub fn capacity(&self) -> usize { @@ -128,6 +136,39 @@ impl AlignedBufferMut { let len = self.len(); AlignedBuffer::from_raw(self.raw, 0..len) } + + /// Clones and appends all elements in a slice to the buffer. Reserves additional capacity as needed. + #[inline] + pub fn extend_from_slice(&mut self, extend: &[u8]) { + let cnt = extend.len(); + self.reserve(cnt); + + // SAFETY: we already reserved additional `cnt` bytes, safe to perform memcpy. + unsafe { + let dst = self.spare_capacity_mut(); + // Reserved above + debug_assert!(dst.len() >= cnt); + + core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt); + } + // SAFETY: We do have at least `cnt` bytes remaining before advance. + unsafe { + bytes::BufMut::advance_mut(self, cnt); + } + } + + /// Returns the remaining spare capacity of the vector as a slice of `MaybeUninit`. 
+ #[inline] + fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit] { + // SAFETY: we guarantees that the `Self::capacity()` bytes from + // `Self::as_mut_ptr()` are allocated. + unsafe { + let ptr = self.as_mut_ptr().add(self.len()); + let len = self.capacity() - self.len(); + + core::slice::from_raw_parts_mut(ptr.cast(), len) + } + } } impl Deref for AlignedBufferMut { diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs index dba695196e..4ea6b17744 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -1,9 +1,15 @@ -use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::{IoBuf, IoBufMut}; -use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf}; +use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf}; +/// A marker trait for a mutable aligned buffer type. pub trait IoBufAlignedMut: IoBufMut {} +/// A marker trait for an aligned buffer type. +pub trait IoBufAligned: IoBuf {} + impl IoBufAlignedMut for IoBufferMut {} +impl IoBufAligned for IoBuffer {} + impl IoBufAlignedMut for PageWriteGuardBuf {} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index c3940cf6ce..525f447b6d 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -5,6 +5,8 @@ use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use super::write::CheapCloneForRead; + /// The true owned equivalent for Rust [`slice`]. Use this for the write path. /// /// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`, @@ -43,6 +45,18 @@ where } } +impl CheapCloneForRead for FullSlice +where + B: IoBuf + CheapCloneForRead, +{ + fn cheap_clone(&self) -> Self { + let bounds = self.slice.bounds(); + let clone = self.slice.get_ref().cheap_clone(); + let slice = clone.slice(bounds); + Self { slice } + } +} + pub(crate) trait IoBufExt { /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`. fn slice_len(self) -> FullSlice diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs deleted file mode 100644 index efcb61ba65..0000000000 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ /dev/null @@ -1,50 +0,0 @@ -use crate::{ - context::RequestContext, - virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter}, -}; -use tokio_epoll_uring::IoBuf; - -pub struct Writer { - dst: W, - bytes_amount: u64, -} - -impl Writer { - pub fn new(dst: W) -> Self { - Self { - dst, - bytes_amount: 0, - } - } - - pub fn bytes_written(&self) -> u64 { - self.bytes_amount - } - - pub fn as_inner(&self) -> &W { - &self.dst - } - - /// Returns the wrapped `VirtualFile` object as well as the number - /// of bytes that were written to it through this object. 
- #[cfg_attr(target_os = "macos", allow(dead_code))] - pub fn into_inner(self) -> (u64, W) { - (self.bytes_amount, self.dst) - } -} - -impl OwnedAsyncWriter for Writer -where - W: OwnedAsyncWriter, -{ - #[inline(always)] - async fn write_all( - &mut self, - buf: FullSlice, - ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; - self.bytes_amount += u64::try_from(nwritten).unwrap(); - Ok((nwritten, buf)) - } -} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 568cf62e56..20bf878123 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,55 +1,88 @@ -use bytes::BytesMut; +mod flush; +use std::sync::Arc; + +use flush::FlushHandle; use tokio_epoll_uring::IoBuf; -use crate::context::RequestContext; +use crate::{ + context::RequestContext, + virtual_file::{IoBuffer, IoBufferMut}, +}; -use super::io_buf_ext::{FullSlice, IoBufExt}; +use super::{ + io_buf_aligned::IoBufAligned, + io_buf_ext::{FullSlice, IoBufExt}, +}; + +pub(crate) use flush::FlushControl; + +pub(crate) trait CheapCloneForRead { + /// Returns a cheap clone of the buffer. + fn cheap_clone(&self) -> Self; +} + +impl CheapCloneForRead for IoBuffer { + fn cheap_clone(&self) -> Self { + // Cheap clone over an `Arc`. + self.clone() + } +} /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. +/// The owned buffers need to be aligned due to Direct IO requirements. pub trait OwnedAsyncWriter { - async fn write_all( - &mut self, + fn write_all_at( + &self, buf: FullSlice, + offset: u64, ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)>; + ) -> impl std::future::Future>> + Send; } /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch /// small writes into larger writes of size [`Buffer::cap`]. -/// -/// # Passthrough Of Large Writers -/// -/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] -/// cause the internal buffer to be flushed prematurely so that the large -/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. -/// -/// This pass-through is generally beneficial for throughput, but if -/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, -/// unlimited large writes may cause latency or fairness issues. -/// -/// In such cases, a different implementation that always buffers in memory -/// may be preferable. -pub struct BufferedWriter { - writer: W, +// TODO(yuchen): For large write, implementing buffer bypass for aligned parts of the write could be beneficial to throughput, +// since we would avoid copying majority of the data into the internal buffer. +pub struct BufferedWriter { + writer: Arc, /// invariant: always remains Some(buf) except /// - while IO is ongoing => goes back to Some() once the IO completed successfully /// - after an IO error => stays `None` forever /// /// In these exceptional cases, it's `None`. - buf: Option, + mutable: Option, + /// A handle to the background flush task for writting data to disk. + flush_handle: FlushHandle, + /// The number of bytes submitted to the background task. 
+ bytes_submitted: u64, } impl BufferedWriter where - B: Buffer + Send, - Buf: IoBuf + Send, - W: OwnedAsyncWriter, + B: Buffer + Send + 'static, + Buf: IoBufAligned + Send + Sync + CheapCloneForRead, + W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { - pub fn new(writer: W, buf: B) -> Self { + /// Creates a new buffered writer. + /// + /// The `buf_new` function provides a way to initialize the owned buffers used by this writer. + pub fn new( + writer: Arc, + buf_new: impl Fn() -> B, + gate_guard: utils::sync::gate::GateGuard, + ctx: &RequestContext, + ) -> Self { Self { - writer, - buf: Some(buf), + writer: writer.clone(), + mutable: Some(buf_new()), + flush_handle: FlushHandle::spawn_new( + writer, + buf_new(), + gate_guard, + ctx.attached_child(), + ), + bytes_submitted: 0, } } @@ -57,87 +90,70 @@ where &self.writer } + /// Returns the number of bytes submitted to the background flush task. + pub fn bytes_submitted(&self) -> u64 { + self.bytes_submitted + } + /// Panics if used after any of the write paths returned an error - pub fn inspect_buffer(&self) -> &B { - self.buf() + pub fn inspect_mutable(&self) -> &B { + self.mutable() + } + + /// Gets a reference to the maybe flushed read-only buffer. + /// Returns `None` if the writer has not submitted any flush request. + pub fn inspect_maybe_flushed(&self) -> Option<&FullSlice> { + self.flush_handle.maybe_flushed.as_ref() } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + pub async fn flush_and_into_inner( + mut self, + ctx: &RequestContext, + ) -> std::io::Result<(u64, Arc)> { self.flush(ctx).await?; - let Self { buf, writer } = self; + let Self { + mutable: buf, + writer, + mut flush_handle, + bytes_submitted: bytes_amount, + } = self; + flush_handle.shutdown().await?; assert!(buf.is_some()); - Ok(writer) + Ok((bytes_amount, writer)) } + /// Gets a reference to the mutable in-memory buffer. #[inline(always)] - fn buf(&self) -> &B { - self.buf + fn mutable(&self) -> &B { + self.mutable .as_ref() .expect("must not use after we returned an error") } - /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted. 
- #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn write_buffered( + pub async fn write_buffered_borrowed( &mut self, - chunk: FullSlice, + chunk: &[u8], ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let chunk = chunk.into_raw_slice(); - - let chunk_len = chunk.len(); - // avoid memcpy for the middle of the chunk - if chunk.len() >= self.buf().cap() { - self.flush(ctx).await?; - // do a big write, bypassing `buf` - assert_eq!( - self.buf - .as_ref() - .expect("must not use after an error") - .pending(), - 0 - ); - let (nwritten, chunk) = self - .writer - .write_all(FullSlice::must_new(chunk), ctx) - .await?; - assert_eq!(nwritten, chunk_len); - return Ok((nwritten, chunk)); + ) -> std::io::Result { + let (len, control) = self.write_buffered_borrowed_controlled(chunk, ctx).await?; + if let Some(control) = control { + control.release().await; } - // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < self.buf().cap()); - let mut slice = &chunk[..]; - while !slice.is_empty() { - let buf = self.buf.as_mut().expect("must not use after an error"); - let need = buf.cap() - buf.pending(); - let have = slice.len(); - let n = std::cmp::min(need, have); - buf.extend_from_slice(&slice[..n]); - slice = &slice[n..]; - if buf.pending() >= buf.cap() { - assert_eq!(buf.pending(), buf.cap()); - self.flush(ctx).await?; - } - } - assert!(slice.is_empty(), "by now we should have drained the chunk"); - Ok((chunk_len, FullSlice::must_new(chunk))) + Ok(len) } - /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. - /// - /// It is less performant because we always have to copy the borrowed data into the internal buffer - /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant - /// for large writes. - pub async fn write_buffered_borrowed( + /// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior. 
+ pub(crate) async fn write_buffered_borrowed_controlled( &mut self, mut chunk: &[u8], ctx: &RequestContext, - ) -> std::io::Result { + ) -> std::io::Result<(usize, Option)> { let chunk_len = chunk.len(); + let mut control: Option = None; while !chunk.is_empty() { - let buf = self.buf.as_mut().expect("must not use after an error"); + let buf = self.mutable.as_mut().expect("must not use after an error"); let need = buf.cap() - buf.pending(); let have = chunk.len(); let n = std::cmp::min(need, have); @@ -145,26 +161,27 @@ where chunk = &chunk[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush(ctx).await?; + if let Some(control) = control.take() { + control.release().await; + } + control = self.flush(ctx).await?; } } - Ok(chunk_len) + Ok((chunk_len, control)) } - async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { - let buf = self.buf.take().expect("must not use after an error"); + #[must_use = "caller must explcitly check the flush control"] + async fn flush(&mut self, _ctx: &RequestContext) -> std::io::Result> { + let buf = self.mutable.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { - self.buf = Some(buf); - return Ok(()); + self.mutable = Some(buf); + return Ok(None); } - let slice = buf.flush(); - let (nwritten, slice) = self.writer.write_all(slice, ctx).await?; - assert_eq!(nwritten, buf_len); - self.buf = Some(Buffer::reuse_after_flush( - slice.into_raw_slice().into_inner(), - )); - Ok(()) + let (recycled, flush_control) = self.flush_handle.flush(buf, self.bytes_submitted).await?; + self.bytes_submitted += u64::try_from(buf_len).unwrap(); + self.mutable = Some(recycled); + Ok(Some(flush_control)) } } @@ -192,64 +209,77 @@ pub trait Buffer { fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; } -impl Buffer for BytesMut { - type IoBuf = BytesMut; +impl Buffer for IoBufferMut { + type IoBuf = IoBuffer; - #[inline(always)] fn cap(&self) -> usize { self.capacity() } fn extend_from_slice(&mut self, other: &[u8]) { - BytesMut::extend_from_slice(self, other) + if self.len() + other.len() > self.cap() { + panic!("Buffer capacity exceeded"); + } + + IoBufferMut::extend_from_slice(self, other); } - #[inline(always)] fn pending(&self) -> usize { self.len() } - fn flush(self) -> FullSlice { - self.slice_len() + fn flush(self) -> FullSlice { + self.freeze().slice_len() } - fn reuse_after_flush(mut iobuf: BytesMut) -> Self { - iobuf.clear(); - iobuf - } -} - -impl OwnedAsyncWriter for Vec { - async fn write_all( - &mut self, - buf: FullSlice, - _: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - self.extend_from_slice(&buf[..]); - Ok((buf.len(), buf)) + /// Caller should make sure that `iobuf` only have one strong reference before invoking this method. + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let mut recycled = iobuf + .into_mut() + .expect("buffer should only have one strong reference"); + recycled.clear(); + recycled } } #[cfg(test)] mod tests { - use bytes::BytesMut; + use std::sync::Mutex; use super::*; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::TaskKind; - #[derive(Default)] + #[derive(Default, Debug)] struct RecorderWriter { - writes: Vec>, + /// record bytes and write offsets. + writes: Mutex, u64)>>, } + + impl RecorderWriter { + /// Gets recorded bytes and write offsets. 
+ fn get_writes(&self) -> Vec> { + self.writes + .lock() + .unwrap() + .iter() + .map(|(buf, _)| buf.clone()) + .collect() + } + } + impl OwnedAsyncWriter for RecorderWriter { - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, _: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - self.writes.push(Vec::from(&buf[..])); - Ok((buf.len(), buf)) + ) -> std::io::Result> { + self.writes + .lock() + .unwrap() + .push((Vec::from(&buf[..]), offset)); + Ok(buf) } } @@ -257,71 +287,21 @@ mod tests { RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) } - macro_rules! write { - ($writer:ident, $data:literal) => {{ - $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx()) - .await?; - }}; - } - #[tokio::test] - async fn test_buffered_writes_only() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"a"); - write!(writer, b"b"); - write!(writer, b"c"); - write!(writer, b"d"); - write!(writer, b"e"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_passthrough_writes_only() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"abc"); - write!(writer, b"de"); - write!(writer, b""); - write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"a"); - write!(writer, b"bc"); - write!(writer, b"d"); - write!(writer, b"e"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + async fn test_write_all_borrowed_always_goes_through_buffer() -> anyhow::Result<()> { let ctx = test_ctx(); let ctx = &ctx; - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + let recorder = Arc::new(RecorderWriter::default()); + let gate = utils::sync::gate::Gate::default(); + let mut writer = BufferedWriter::<_, RecorderWriter>::new( + recorder, + || IoBufferMut::with_capacity(2), + gate.enter()?, + ctx, + ); writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"", ctx).await?; writer.write_buffered_borrowed(b"d", ctx).await?; writer.write_buffered_borrowed(b"e", ctx).await?; writer.write_buffered_borrowed(b"fg", ctx).await?; @@ -329,9 +309,9 @@ mod tests { writer.write_buffered_borrowed(b"j", ctx).await?; writer.write_buffered_borrowed(b"klmno", ctx).await?; - let recorder = writer.flush_and_into_inner(ctx).await?; + let (_, recorder) = writer.flush_and_into_inner(ctx).await?; assert_eq!( - recorder.writes, + recorder.get_writes(), { let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; expect diff --git 
a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs new file mode 100644 index 0000000000..9ce8b311bb --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -0,0 +1,314 @@ +use std::sync::Arc; + +use utils::sync::duplex; + +use crate::{ + context::RequestContext, + virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, +}; + +use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; + +/// A handle to the flush task. +pub struct FlushHandle { + inner: Option>, + /// Immutable buffer for serving tail reads. + /// `None` if no flush request has been submitted. + pub(super) maybe_flushed: Option>, +} + +pub struct FlushHandleInner { + /// A bi-directional channel that sends (buffer, offset) for writes, + /// and receives recyled buffer. + channel: duplex::mpsc::Duplex, FullSlice>, + /// Join handle for the background flush task. + join_handle: tokio::task::JoinHandle>>, +} + +struct FlushRequest { + slice: FullSlice, + offset: u64, + #[cfg(test)] + ready_to_flush_rx: tokio::sync::oneshot::Receiver<()>, + #[cfg(test)] + done_flush_tx: tokio::sync::oneshot::Sender<()>, +} + +/// Constructs a request and a control object for a new flush operation. +#[cfg(not(test))] +fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { + let request = FlushRequest { slice, offset }; + let control = FlushControl::untracked(); + + (request, control) +} + +/// Constructs a request and a control object for a new flush operation. +#[cfg(test)] +fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { + let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel(); + let (done_flush_tx, done_flush_rx) = tokio::sync::oneshot::channel(); + let control = FlushControl::not_started(ready_to_flush_tx, done_flush_rx); + + let request = FlushRequest { + slice, + offset, + ready_to_flush_rx, + done_flush_tx, + }; + (request, control) +} + +/// A handle to a `FlushRequest` that allows unit tests precise control over flush behavior. +#[cfg(test)] +pub(crate) struct FlushControl { + not_started: FlushNotStarted, +} + +#[cfg(not(test))] +pub(crate) struct FlushControl; + +impl FlushControl { + #[cfg(test)] + fn not_started( + ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, + done_flush_rx: tokio::sync::oneshot::Receiver<()>, + ) -> Self { + FlushControl { + not_started: FlushNotStarted { + ready_to_flush_tx, + done_flush_rx, + }, + } + } + + #[cfg(not(test))] + fn untracked() -> Self { + FlushControl + } + + /// In tests, turn flush control into a not started state. + #[cfg(test)] + pub(crate) fn into_not_started(self) -> FlushNotStarted { + self.not_started + } + + /// Release control to the submitted buffer. + /// + /// In `cfg(test)` environment, the buffer is guranteed to be flushed to disk after [`FlushControl::release`] is finishes execution. + pub async fn release(self) { + #[cfg(test)] + { + self.not_started + .ready_to_flush() + .wait_until_flush_is_done() + .await; + } + } +} + +impl FlushHandle +where + Buf: IoBufAligned + Send + Sync + CheapCloneForRead, + W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, +{ + /// Spawns a new background flush task and obtains a handle. + /// + /// Note: The background task so we do not need to explicitly maintain a queue of buffers. 
+ pub fn spawn_new( + file: Arc, + buf: B, + gate_guard: utils::sync::gate::GateGuard, + ctx: RequestContext, + ) -> Self + where + B: Buffer + Send + 'static, + { + // It is fine to buffer up to only 1 message. We only 1 message in-flight at a time. + let (front, back) = duplex::mpsc::channel(1); + + let join_handle = tokio::spawn(async move { + FlushBackgroundTask::new(back, file, gate_guard, ctx) + .run(buf.flush()) + .await + }); + + FlushHandle { + inner: Some(FlushHandleInner { + channel: front, + join_handle, + }), + maybe_flushed: None, + } + } + + /// Submits a buffer to be flushed in the background task. + /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged. + /// If `save_buf_for_read` is true, then we save the buffer in `Self::maybe_flushed`, otherwise + /// clear `maybe_flushed`. + pub async fn flush(&mut self, buf: B, offset: u64) -> std::io::Result<(B, FlushControl)> + where + B: Buffer + Send + 'static, + { + let slice = buf.flush(); + + // Saves a buffer for read while flushing. This also removes reference to the old buffer. + self.maybe_flushed = Some(slice.cheap_clone()); + + let (request, flush_control) = new_flush_op(slice, offset); + + // Submits the buffer to the background task. + let submit = self.inner_mut().channel.send(request).await; + if submit.is_err() { + return self.handle_error().await; + } + + // Wait for an available buffer from the background flush task. + // This is the BACKPRESSURE mechanism: if the flush task can't keep up, + // then the write path will eventually wait for it here. + let Some(recycled) = self.inner_mut().channel.recv().await else { + return self.handle_error().await; + }; + + // The only other place that could hold a reference to the recycled buffer + // is in `Self::maybe_flushed`, but we have already replace it with the new buffer. + let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner()); + Ok((recycled, flush_control)) + } + + async fn handle_error(&mut self) -> std::io::Result { + Err(self + .shutdown() + .await + .expect_err("flush task only disconnects duplex if it exits with an error")) + } + + /// Cleans up the channel, join the flush task. + pub async fn shutdown(&mut self) -> std::io::Result> { + let handle = self + .inner + .take() + .expect("must not use after we returned an error"); + drop(handle.channel.tx); + handle.join_handle.await.unwrap() + } + + /// Gets a mutable reference to the inner handle. Panics if [`Self::inner`] is `None`. + /// This only happens if the handle is used after an error. + fn inner_mut(&mut self) -> &mut FlushHandleInner { + self.inner + .as_mut() + .expect("must not use after we returned an error") + } +} + +/// A background task for flushing data to disk. +pub struct FlushBackgroundTask { + /// A bi-directional channel that receives (buffer, offset) for writes, + /// and send back recycled buffer. + channel: duplex::mpsc::Duplex, FlushRequest>, + /// A writter for persisting data to disk. + writer: Arc, + ctx: RequestContext, + /// Prevent timeline from shuting down until the flush background task finishes flushing all remaining buffers to disk. + _gate_guard: utils::sync::gate::GateGuard, +} + +impl FlushBackgroundTask +where + Buf: IoBufAligned + Send + Sync, + W: OwnedAsyncWriter + Sync + 'static, +{ + /// Creates a new background flush task. 
+ fn new( + channel: duplex::mpsc::Duplex, FlushRequest>, + file: Arc, + gate_guard: utils::sync::gate::GateGuard, + ctx: RequestContext, + ) -> Self { + FlushBackgroundTask { + channel, + writer: file, + _gate_guard: gate_guard, + ctx, + } + } + + /// Runs the background flush task. + /// The passed in slice is immediately sent back to the flush handle through the duplex channel. + async fn run(mut self, slice: FullSlice) -> std::io::Result> { + // Sends the extra buffer back to the handle. + self.channel.send(slice).await.map_err(|_| { + std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early") + })?; + + // Exit condition: channel is closed and there is no remaining buffer to be flushed + while let Some(request) = self.channel.recv().await { + #[cfg(test)] + { + // In test, wait for control to signal that we are ready to flush. + if request.ready_to_flush_rx.await.is_err() { + tracing::debug!("control dropped"); + } + } + + // Write slice to disk at `offset`. + let slice = self + .writer + .write_all_at(request.slice, request.offset, &self.ctx) + .await?; + + #[cfg(test)] + { + // In test, tell control we are done flushing buffer. + if request.done_flush_tx.send(()).is_err() { + tracing::debug!("control dropped"); + } + } + + // Sends the buffer back to the handle for reuse. The handle is in charged of cleaning the buffer. + if self.channel.send(slice).await.is_err() { + // Although channel is closed. Still need to finish flushing the remaining buffers. + continue; + } + } + + Ok(self.writer) + } +} + +#[cfg(test)] +pub(crate) struct FlushNotStarted { + ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, + done_flush_rx: tokio::sync::oneshot::Receiver<()>, +} + +#[cfg(test)] +pub(crate) struct FlushInProgress { + done_flush_rx: tokio::sync::oneshot::Receiver<()>, +} + +#[cfg(test)] +pub(crate) struct FlushDone; + +#[cfg(test)] +impl FlushNotStarted { + /// Signals the background task the buffer is ready to flush to disk. + pub fn ready_to_flush(self) -> FlushInProgress { + self.ready_to_flush_tx + .send(()) + .map(|_| FlushInProgress { + done_flush_rx: self.done_flush_rx, + }) + .unwrap() + } +} + +#[cfg(test)] +impl FlushInProgress { + /// Waits until background flush is done. 
+ pub async fn wait_until_flush_is_done(self) -> FlushDone { + self.done_flush_rx.await.unwrap(); + FlushDone + } +} From d0289299457dc6d1caece326553af47aa976e80a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Dec 2024 17:42:17 +0000 Subject: [PATCH 192/209] chore: update clap (#10009) This updates clap to use a new version of anstream --- Cargo.lock | 62 +++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 38158b7aec..de8785f87e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,16 +84,16 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", + "is_terminal_polyfill", "utf8parse", ] @@ -123,12 +123,12 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" dependencies = [ "anstyle", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -1167,35 +1167,33 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.0" +version = "4.5.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc" +checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.0" +version = "4.5.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990" +checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1" dependencies = [ "anstream", "anstyle", - "bitflags 1.3.2", "clap_lex", - "strsim", + "strsim 0.11.1", ] [[package]] name = "clap_derive" -version = "4.3.0" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro2", "quote", "syn 2.0.90", @@ -1203,9 +1201,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.5.0" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" [[package]] name = "colorchoice" @@ -1614,7 +1612,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.10.0", "syn 2.0.90", ] @@ -1812,7 +1810,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" dependencies = [ "darling", "either", - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn 2.0.90", @@ -2465,12 +2463,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "heck" -version = "0.4.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -2888,6 +2880,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.10.5" @@ -3169,7 +3167,7 @@ version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn 2.0.90", @@ -4458,7 +4456,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", - "heck 0.5.0", + "heck", "itertools 0.12.1", "log", "multimap", @@ -6166,6 +6164,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.26.3" @@ -6178,7 +6182,7 @@ version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "rustversion", From ecdad5e6d5b614812b000acb24a99d5d7d2e56f4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Dec 2024 21:07:44 +0000 Subject: [PATCH 193/209] chore: update rust-postgres (#10002) Like #9931 but without rebasing upstream just yet, to try and minimise the differences. Removes all proxy-specific commits from the rust-postgres fork, now that proxy no longer depends on them. Merging upstream changes to come later. 
--- Cargo.lock | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index de8785f87e..62f06d45bd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4169,7 +4169,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "bytes", "fallible-iterator", @@ -4182,7 +4182,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "base64 0.20.0", "byteorder", @@ -4195,7 +4195,6 @@ dependencies = [ "rand 0.8.5", "sha2", "stringprep", - "tokio", ] [[package]] @@ -4217,7 +4216,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "bytes", "fallible-iterator", @@ -6547,7 +6546,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "async-trait", "byteorder", From a83bd4e81c82680b4d74add46c6096b3ed2d9ce4 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 4 Dec 2024 21:16:09 -0500 Subject: [PATCH 194/209] pageserver: fix buffered-writer on macos build (#10019) ## Problem In https://github.com/neondatabase/neon/pull/9693, we forgot to check macos build. The [CI run](https://github.com/neondatabase/neon/actions/runs/12164541897/job/33926455468) on main showed that macos build failed with unused variables and dead code. ## Summary of changes - add `allow(dead_code)` and `allow(unused_variables)` to the relevant code that is not used on macos. 
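For reference, here is a minimal, self-contained sketch of the attribute pattern the bullet above refers to. The item names are illustrative and not taken from this patch; the real targets are parameters and helpers in `download.rs` and `write.rs` that only the Linux io engine uses:

```rust
// Hypothetical items, shown only to illustrate the pattern this patch applies.
#[cfg_attr(target_os = "macos", allow(dead_code))]
fn linux_only_helper(buf: &[u8]) -> usize {
    // In the real code, only the tokio-epoll-uring (Linux) path calls this.
    buf.len()
}

fn download(#[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &str) {
    // The parameter is consumed only by the Linux-only branch below, so on macOS
    // it would otherwise trip `unused_variables`.
    #[cfg(target_os = "linux")]
    println!("holding {gate}, wrote {} bytes", linux_only_helper(b"io_uring"));
}

fn main() {
    download("timeline gate");
}
```

The items still compile on macOS; the attributes only silence the lints there, which is presumably enough because the macOS CI build treats warnings as errors.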
Signed-off-by: Yuchen Liang --- pageserver/src/tenant/remote_timeline_client/download.rs | 7 ++++--- pageserver/src/virtual_file/owned_buffers_io/write.rs | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index c5ae466f3a..d15f161fb6 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,7 +6,6 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; -use std::sync::Arc; use std::time::SystemTime; use anyhow::{anyhow, Context}; @@ -27,7 +26,7 @@ use crate::span::{ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -use crate::virtual_file::{on_fatal_io_error, IoBufferMut, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{ DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, @@ -150,7 +149,7 @@ async fn download_object<'a>( storage: &'a GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, - gate: &utils::sync::gate::Gate, + #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate, cancel: &CancellationToken, #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { @@ -209,6 +208,8 @@ async fn download_object<'a>( #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { use crate::virtual_file::owned_buffers_io; + use crate::virtual_file::IoBufferMut; + use std::sync::Arc; async { let destination_file = Arc::new( VirtualFile::create(dst_path, ctx) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 20bf878123..7299d83703 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -132,6 +132,7 @@ where .expect("must not use after we returned an error") } + #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered_borrowed( &mut self, chunk: &[u8], From 0fd211537baed6178d1d6be2928493cb02346c20 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 5 Dec 2024 07:30:38 +0200 Subject: [PATCH 195/209] proxy: Present new auth backend cplane_proxy_v1 (#10012) Implement a new auth backend based on the current Neon backend to switch to the new Proxy V1 cplane API. 
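As a rough, self-contained sketch of how the new backend is picked on the command line (trimmed to two variants; `Copy`, `Debug` and `PartialEq` are derived here only so the standalone snippet compiles, and clap's `derive` feature is assumed):

```rust
use clap::ValueEnum;

// Trimmed-down copy of the AuthBackendType enum this patch extends; the real
// enum in proxy/src/bin/proxy.rs has more variants.
#[derive(Clone, Copy, Debug, PartialEq, ValueEnum)]
enum AuthBackendType {
    #[value(name("cplane-v1"), alias("control-plane"))]
    ControlPlaneV1,
    #[value(name("console"), alias("cplane"))]
    ControlPlane,
}

fn main() {
    // Both the new name and its alias select the Proxy V1 backend...
    assert_eq!(
        AuthBackendType::from_str("cplane-v1", true).unwrap(),
        AuthBackendType::ControlPlaneV1
    );
    assert_eq!(
        AuthBackendType::from_str("control-plane", true).unwrap(),
        AuthBackendType::ControlPlaneV1
    );
    // ...while the existing names keep selecting the current backend.
    assert_eq!(
        AuthBackendType::from_str("console", true).unwrap(),
        AuthBackendType::ControlPlane
    );
}
```

Selecting `cplane-v1` (or its `control-plane` alias) routes `build_auth_backend` into the new `cplane_proxy_v1` client, while `console`/`cplane` keep the existing Neon backend, so deployments can opt into the new API through configuration alone.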
Implements [#21048](https://github.com/neondatabase/cloud/issues/21048) --- proxy/src/auth/backend/mod.rs | 4 + proxy/src/bin/proxy.rs | 99 +++- .../control_plane/client/cplane_proxy_v1.rs | 514 ++++++++++++++++++ proxy/src/control_plane/client/mod.rs | 7 + proxy/src/control_plane/client/neon.rs | 2 +- proxy/src/control_plane/messages.rs | 10 + 6 files changed, 634 insertions(+), 2 deletions(-) create mode 100644 proxy/src/control_plane/client/cplane_proxy_v1.rs diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 84a572dcf9..1bad7b3086 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -70,6 +70,10 @@ impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { + ControlPlaneClient::ProxyV1(endpoint) => fmt + .debug_tuple("ControlPlane::ProxyV1") + .field(&endpoint.url()) + .finish(), ControlPlaneClient::Neon(endpoint) => fmt .debug_tuple("ControlPlane::Neon") .field(&endpoint.url()) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index c929b97d78..99144acef0 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -46,6 +46,9 @@ enum AuthBackendType { #[value(name("console"), alias("cplane"))] ControlPlane, + #[value(name("cplane-v1"), alias("control-plane"))] + ControlPlaneV1, + #[value(name("link"), alias("control-redirect"))] ConsoleRedirect, @@ -518,6 +521,39 @@ async fn main() -> anyhow::Result<()> { .instrument(span), ); } + } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } } } @@ -662,6 +698,65 @@ fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { + AuthBackendType::ControlPlaneV1 => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = 
args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); + + let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + + let endpoint = http::Endpoint::new(url, http::new_client()); + + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let api = control_plane::client::ControlPlaneClient::ProxyV1(api); + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = @@ -697,13 +792,15 @@ fn build_auth_backend( )?)); tokio::spawn(locks.garbage_collect_worker()); - let url = args.auth_endpoint.parse()?; + let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(url, http::new_client()); let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + let api = control_plane::client::neon::NeonControlPlaneClient::new( endpoint, args.control_plane_token.clone(), diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs new file mode 100644 index 0000000000..e33a37f643 --- /dev/null +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -0,0 +1,514 @@ +//! Production console backend. 
+ +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; +use futures::TryFutureExt; +use postgres_client::config::SslMode; +use tokio::time::Instant; +use tracing::{debug, info, info_span, warn, Instrument}; + +use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestContext; +use crate::control_plane::caches::ApiCaches; +use crate::control_plane::errors::{ + ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, +}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::control_plane::{ + AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, +}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use crate::types::{EndpointCacheKey, EndpointId}; +use crate::{compute, http, scram}; + +const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); + +#[derive(Clone)] +pub struct NeonControlPlaneClient { + endpoint: http::Endpoint, + pub caches: &'static ApiCaches, + pub(crate) locks: &'static ApiLocks, + pub(crate) wake_compute_endpoint_rate_limiter: Arc, + // put in a shared ref so we don't copy secrets all over in memory + jwt: Arc, +} + +impl NeonControlPlaneClient { + /// Construct an API object containing the auth parameters. + pub fn new( + endpoint: http::Endpoint, + jwt: Arc, + caches: &'static ApiCaches, + locks: &'static ApiLocks, + wake_compute_endpoint_rate_limiter: Arc, + ) -> Self { + Self { + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + jwt, + } + } + + pub(crate) fn url(&self) -> &str { + self.endpoint.url().as_str() + } + + async fn do_get_auth_info( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } + let request_id = ctx.session_id().to_string(); + let application_name = ctx.console_application_name(); + async { + let request = self + .endpoint + .get_path("get_endpoint_access_control") + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .query(&[ + ("application_name", application_name.as_str()), + ("endpointish", user_info.endpoint.as_str()), + ("role", user_info.user.as_str()), + ]) + .build()?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + let body = match parse_body::(response).await { + Ok(body) => body, + // Error 404 is special: it's ok not to have a secret. + // TODO(anna): retry + Err(e) => { + return if e.get_reason().is_not_found() { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. 
+ Ok(AuthInfo::default()) + } else { + Err(e.into()) + }; + } + }; + + // Ivan: don't know where it will be used, so I leave it here + let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); + + let secret = if body.role_secret.is_empty() { + None + } else { + let secret = scram::ServerSecret::parse(&body.role_secret) + .map(AuthSecret::Scram) + .ok_or(GetAuthInfoError::BadSecret)?; + Some(secret) + }; + let allowed_ips = body.allowed_ips.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); + Ok(AuthInfo { + secret, + allowed_ips, + project_id: body.project_id, + }) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_auth_info")) + .await + } + + async fn do_get_endpoint_jwks( + &self, + ctx: &RequestContext, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &endpoint.normalize()) + { + return Err(GetEndpointJwksError::EndpointNotFound); + } + let request_id = ctx.session_id().to_string(); + async { + let request = self + .endpoint + .get_with_url(|url| { + url.path_segments_mut() + .push("endpoints") + .push(endpoint.as_str()) + .push("jwks"); + }) + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + + let body = parse_body::(response).await?; + + let rules = body + .jwks + .into_iter() + .map(|jwks| AuthRule { + id: jwks.id, + jwks_url: jwks.jwks_url, + audience: jwks.jwt_audience, + role_names: jwks.role_names, + }) + .collect(); + + Ok(rules) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_endpoint_jwks")) + .await + } + + async fn do_wake_compute( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let request_id = ctx.session_id().to_string(); + let application_name = ctx.console_application_name(); + async { + let mut request_builder = self + .endpoint + .get_path("wake_compute") + .header("X-Request-ID", &request_id) + .header("Authorization", format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .query(&[ + ("application_name", application_name.as_str()), + ("endpointish", user_info.endpoint.as_str()), + ]); + + let options = user_info.options.to_deep_object(); + if !options.is_empty() { + request_builder = request_builder.query(&options); + } + + let request = request_builder.build()?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + let body = parse_body::(response).await?; + + // Unfortunately, ownership won't let us use `Option::ok_or` here. + let (host, port) = match parse_host_port(&body.address) { + None => return Err(WakeComputeError::BadComputeAddress(body.address)), + Some(x) => x, + }; + + // Don't set anything but host and port! 
This config will be cached. + // We'll set username and such later using the startup message. + // TODO: add more type safety (in progress). + let mut config = compute::ConnCfg::new(host.to_owned(), port); + config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + + let node = NodeInfo { + config, + aux: body.aux, + allow_self_signed_compute: false, + }; + + Ok(node) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_wake_compute")) + .await + } +} + +impl super::ControlPlaneApi for NeonControlPlaneClient { + #[tracing::instrument(skip_all)] + async fn get_role_secret( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + let user = &user_info.user; + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { + return Ok(role_secret); + } + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_ips), + ); + ctx.set_project_id(project_id); + } + // When we just got a secret, we don't need to invalidate it. + Ok(Cached::new_uncached(auth_info.secret)) + } + + async fn get_allowed_ips_and_secret( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); + return Ok((allowed_ips, None)); + } + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let user = &user_info.user; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + ctx.set_project_id(project_id); + } + Ok(( + Cached::new_uncached(allowed_ips), + Some(Cached::new_uncached(auth_info.secret)), + )) + } + + #[tracing::instrument(skip_all)] + async fn get_endpoint_jwks( + &self, + ctx: &RequestContext, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { + self.do_get_endpoint_jwks(ctx, endpoint).await + } + + #[tracing::instrument(skip_all)] + async fn wake_compute( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let key = user_info.endpoint_cache_key(); + + macro_rules! 
check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c))) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + + // Every time we do a wakeup http request, the compute node will stay up + // for some time (highly depends on the console's scale-to-zero policy); + // The connection info remains the same during that period of time, + // which means that we might cache it to reduce the load and latency. + check_cache!(); + + let permit = self.locks.get_permit(&key).await?; + + // after getting back a permit - it's possible the cache was filled + // double check + if permit.should_check_cache() { + // TODO: if there is something in the cache, mark the permit as success. + check_cache!(); + } + + // check rate limit + if !self + .wake_compute_endpoint_rate_limiter + .check(user_info.endpoint.normalize_intern(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } + + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); + + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; + + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); + + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); + } + + // at this point, we should only have quota errors. + debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(err.clone()), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))) + } + err => return Err(err), + }, + } + } +} + +/// Parse http response body, taking status code into account. +async fn parse_body serde::Deserialize<'a>>( + response: http::Response, +) -> Result { + let status = response.status(); + if status.is_success() { + // We shouldn't log raw body because it may contain secrets. + info!("request succeeded, processing the body"); + return Ok(response.json().await?); + } + let s = response.bytes().await?; + // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. + info!("response_error plaintext: {:?}", s); + + // Don't throw an error here because it's not as important + // as the fact that the request itself has failed. 
+ let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { + warn!("failed to parse error body: {e}"); + ControlPlaneErrorMessage { + error: "reason unclear (malformed error message)".into(), + http_status_code: status, + status: None, + } + }); + body.http_status_code = status; + + warn!("console responded with an error ({status}): {body:?}"); + Err(ControlPlaneError::Message(Box::new(body))) +} + +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.rsplit_once(':')?; + let ipv6_brackets: &[_] = &['[', ']']; + Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_host_port_v4() { + let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); + assert_eq!(host, "127.0.0.1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_v6() { + let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); + assert_eq!(host, "2001:db8::1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_url() { + let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") + .expect("failed to parse"); + assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); + assert_eq!(port, 5432); + } +} diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index f8f74372f0..7ef5a9c9fd 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -1,3 +1,4 @@ +pub mod cplane_proxy_v1; #[cfg(any(test, feature = "testing"))] pub mod mock; pub mod neon; @@ -27,6 +28,8 @@ use crate::types::EndpointId; #[non_exhaustive] #[derive(Clone)] pub enum ControlPlaneClient { + /// New Proxy V1 control plane API + ProxyV1(cplane_proxy_v1::NeonControlPlaneClient), /// Current Management API (V2). Neon(neon::NeonControlPlaneClient), /// Local mock control plane. 
@@ -45,6 +48,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result { match self { + Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await, Self::Neon(api) => api.get_role_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await, @@ -61,6 +65,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { match self { + Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, @@ -75,6 +80,7 @@ impl ControlPlaneApi for ControlPlaneClient { endpoint: EndpointId, ) -> Result, errors::GetEndpointJwksError> { match self { + Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await, Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await, @@ -89,6 +95,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result { match self { + Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await, Self::Neon(api) => api.wake_compute(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 5c204ae1d7..bf62c0d6ab 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -1,4 +1,4 @@ -//! Production console backend. +//! Stale console backend, remove after migrating to Proxy V1 API (#15245). use std::sync::Arc; use std::time::Duration; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 8762ba874b..2662ab85f9 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -230,6 +230,16 @@ pub(crate) struct GetRoleSecret { pub(crate) project_id: Option, } +/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. +/// Returned by the `/get_endpoint_access_control` API method. +#[derive(Deserialize)] +pub(crate) struct GetEndpointAccessControl { + pub(crate) role_secret: Box, + pub(crate) allowed_ips: Option>, + pub(crate) project_id: Option, + pub(crate) allowed_vpc_endpoint_ids: Option>, +} + // Manually implement debug to omit sensitive info. 
impl fmt::Debug for GetRoleSecret { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { From def05700d56bc23d0b805abe248644540ab18ad8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 12 Dec 2024 06:02:08 +0000 Subject: [PATCH 196/209] Proxy release 2024-12-12 From 758680d4f874c9f290176bfc2124361472ba4e4f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 22:06:42 +0000 Subject: [PATCH 197/209] Proxy release 2024-12-17 From 9c92242ca002d6b35196ce1f2a6819e64da442f9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 9 Jan 2025 06:02:06 +0000 Subject: [PATCH 198/209] Proxy release 2025-01-09 From 44b4e355a2e3ba0e4d32293123c790c06700afb3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 16 Jan 2025 06:02:04 +0000 Subject: [PATCH 199/209] Proxy release 2025-01-16 From 8b4088dd8a2842746e7fb159e100a940c7e66e6d Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 30 Jan 2025 06:02:00 +0000 Subject: [PATCH 200/209] Proxy release 2025-01-30 From fedf4f169c5a7641e1d5233289af4e91aaba8274 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 6 Feb 2025 06:02:11 +0000 Subject: [PATCH 201/209] Proxy release 2025-02-06 From c65fc5a955cce7af5fc3f4e9ce070f4faf44b47f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 13 Feb 2025 06:02:01 +0000 Subject: [PATCH 202/209] Proxy release 2025-02-13 From 446b3f9d28ecbcad95044032b5dc37af367d6111 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 20 Feb 2025 06:02:01 +0000 Subject: [PATCH 203/209] Proxy release 2025-02-20 From 332f064a4275452fdd418a7e38d65ada539596b6 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 27 Feb 2025 00:17:57 +0000 Subject: [PATCH 204/209] Proxy release 2025-02-27 From a1b9528757011e24ac6c8abb03ab03f62395e51a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 27 Feb 2025 16:18:42 +0000 Subject: [PATCH 205/209] Proxy release 2025-02-27 From 6c86fe7143faf0117f64b21663e5c0285d08ce0a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 6 Mar 2025 06:02:15 +0000 Subject: [PATCH 206/209] Proxy release 2025-03-06 From 93350f7018f7a8c1e58886d910f1d608c546bda3 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 25 Jun 2025 15:19:20 +0100 Subject: [PATCH 207/209] [proxy]: BatchQueue::call is not cancel safe - make it directly cancellation aware (#12345) ## Problem https://github.com/neondatabase/cloud/issues/30539 If the current leader cancels the `call` function, then it has removed the jobs from the queue, but will never finish sending the responses. Because of this, it is not cancellation safe. ## Summary of changes Document these functions as not cancellation safe. Move cancellation of the queued jobs into the queue itself. ## Alternatives considered 1. We could spawn the task that runs the batch, since that won't get cancelled. 
* This requires `fn call(self: Arc)` or `fn call(&'static self)`. 2. We could add another scopeguard and return the requests back to the queue. * This requires that requests are always retry safe, and also requires requests to be `Clone`. --- proxy/src/batch.rs | 80 +++++++++++++++++++++++++----------- proxy/src/cancellation.rs | 85 +++++++++++++++++++++++---------------- proxy/src/redis/keys.rs | 3 +- 3 files changed, 109 insertions(+), 59 deletions(-) diff --git a/proxy/src/batch.rs b/proxy/src/batch.rs index 61bdf2b747..33e08797f2 100644 --- a/proxy/src/batch.rs +++ b/proxy/src/batch.rs @@ -6,7 +6,6 @@ use std::collections::BTreeMap; use std::pin::pin; use std::sync::Mutex; -use futures::future::Either; use scopeguard::ScopeGuard; use tokio::sync::oneshot::error::TryRecvError; @@ -49,37 +48,67 @@ impl BatchQueue
<P>
{ } } - pub async fn call(&self, req: P::Req) -> P::Res { + /// Perform a single request-response process, this may be batched internally. + /// + /// This function is not cancel safe. + pub async fn call( + &self, + req: P::Req, + cancelled: impl Future, + ) -> Result { let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req); - let guard = scopeguard::guard(id, move |id| { - let mut inner = self.inner.lock_propagate_poison(); - if inner.queue.remove(&id).is_some() { - tracing::debug!("batched task cancelled before completion"); - } - }); + let mut cancelled = pin!(cancelled); let resp = loop { // try become the leader, or try wait for success. - let mut processor = match futures::future::select(rx, pin!(self.processor.lock())).await - { - // we got the resp. - Either::Left((resp, _)) => break resp.ok(), - // we are the leader. - Either::Right((p, rx_)) => { - rx = rx_; - p - } + let mut processor = tokio::select! { + // try become leader. + p = self.processor.lock() => p, + // wait for success. + resp = &mut rx => break resp.ok(), + // wait for cancellation. + cancel = cancelled.as_mut() => { + let mut inner = self.inner.lock_propagate_poison(); + if inner.queue.remove(&id).is_some() { + tracing::warn!("batched task cancelled before completion"); + } + return Err(cancel); + }, }; + tracing::debug!(id, "batch: became leader"); let (reqs, resps) = self.inner.lock_propagate_poison().get_batch(&processor); + // snitch incase the task gets cancelled. + let cancel_safety = scopeguard::guard((), |()| { + if !std::thread::panicking() { + tracing::error!( + id, + "batch: leader cancelled, despite not being cancellation safe" + ); + } + }); + // apply a batch. + // if this is cancelled, jobs will not be completed and will panic. let values = processor.apply(reqs).await; + // good: we didn't get cancelled. + ScopeGuard::into_inner(cancel_safety); + + if values.len() != resps.len() { + tracing::error!( + "batch: invalid response size, expected={}, got={}", + resps.len(), + values.len() + ); + } + // send response values. for (tx, value) in std::iter::zip(resps, values) { - // sender hung up but that's fine. - drop(tx.send(value)); + if tx.send(value).is_err() { + // receiver hung up but that's fine. + } } match rx.try_recv() { @@ -98,10 +127,9 @@ impl BatchQueue
<P>
{ } }; - // already removed. - ScopeGuard::into_inner(guard); + tracing::debug!(id, "batch: job completed"); - resp.expect("no response found. batch processer should not panic") + Ok(resp.expect("no response found. batch processer should not panic")) } } @@ -125,6 +153,8 @@ impl BatchQueueInner
<P>
{ self.queue.insert(id, BatchJob { req, res: tx }); + tracing::debug!(id, "batch: registered job in the queue"); + (id, rx) } @@ -132,15 +162,19 @@ impl BatchQueueInner
<P>
{ let batch_size = p.batch_size(self.queue.len()); let mut reqs = Vec::with_capacity(batch_size); let mut resps = Vec::with_capacity(batch_size); + let mut ids = Vec::with_capacity(batch_size); while reqs.len() < batch_size { - let Some((_, job)) = self.queue.pop_first() else { + let Some((id, job)) = self.queue.pop_first() else { break; }; reqs.push(job.req); resps.push(job.res); + ids.push(id); } + tracing::debug!(ids=?ids, "batch: acquired jobs"); + (reqs, resps) } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 036f36c7f6..ffc0cf43f1 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,5 +1,6 @@ use std::convert::Infallible; use std::net::{IpAddr, SocketAddr}; +use std::pin::pin; use std::sync::{Arc, OnceLock}; use std::time::Duration; @@ -98,7 +99,6 @@ impl Pipeline { impl CancelKeyOp { fn register(&self, pipe: &mut Pipeline) { - #[allow(clippy::used_underscore_binding)] match self { CancelKeyOp::StoreCancelKey { key, value, expire } => { let key = KeyPrefix::Cancel(*key).build_redis_key(); @@ -224,6 +224,7 @@ impl CancellationHandler { } } + /// This is not cancel safe async fn get_cancel_key( &self, key: CancelKeyData, @@ -240,16 +241,21 @@ impl CancellationHandler { }; const TIMEOUT: Duration = Duration::from_secs(5); - let result = timeout(TIMEOUT, tx.call((guard, op))) - .await - .map_err(|_| { - tracing::warn!("timed out waiting to receive GetCancelData response"); - CancelError::RateLimit - })? - .map_err(|e| { - tracing::warn!("failed to receive GetCancelData response: {e}"); - CancelError::InternalError - })?; + let result = timeout( + TIMEOUT, + tx.call((guard, op), std::future::pending::()), + ) + .await + .map_err(|_| { + tracing::warn!("timed out waiting to receive GetCancelData response"); + CancelError::RateLimit + })? + // cannot be cancelled + .unwrap_or_else(|x| match x {}) + .map_err(|e| { + tracing::warn!("failed to receive GetCancelData response: {e}"); + CancelError::InternalError + })?; let cancel_state_str = String::from_owned_redis_value(result).map_err(|e| { tracing::warn!("failed to receive GetCancelData response: {e}"); @@ -271,6 +277,8 @@ impl CancellationHandler { /// Will fetch IP allowlist internally. /// /// return Result primarily for tests + /// + /// This is not cancel safe pub(crate) async fn cancel_session( &self, key: CancelKeyData, @@ -394,6 +402,8 @@ impl Session { /// Ensure the cancel key is continously refreshed, /// but stop when the channel is dropped. + /// + /// This is not cancel safe pub(crate) async fn maintain_cancel_key( &self, session_id: uuid::Uuid, @@ -401,27 +411,6 @@ impl Session { cancel_closure: &CancelClosure, compute_config: &ComputeConfig, ) { - futures::future::select( - std::pin::pin!(self.maintain_redis_cancel_key(cancel_closure)), - cancel, - ) - .await; - - if let Err(err) = cancel_closure - .try_cancel_query(compute_config) - .boxed() - .await - { - tracing::warn!( - ?session_id, - ?err, - "could not cancel the query in the database" - ); - } - } - - // Ensure the cancel key is continously refreshed. - async fn maintain_redis_cancel_key(&self, cancel_closure: &CancelClosure) -> ! { let Some(tx) = self.cancellation_handler.tx.get() else { tracing::warn!("cancellation handler is not available"); // don't exit, as we only want to exit if cancelled externally. 
@@ -432,6 +421,8 @@ impl Session { .expect("serialising to json string should not fail") .into_boxed_str(); + let mut cancel = pin!(cancel); + loop { let guard = Metrics::get() .proxy @@ -449,9 +440,35 @@ impl Session { "registering cancellation key" ); - if tx.call((guard, op)).await.is_ok() { - tokio::time::sleep(CANCEL_KEY_REFRESH).await; + match tx.call((guard, op), cancel.as_mut()).await { + Ok(Ok(_)) => { + tracing::debug!( + src=%self.key, + dest=?cancel_closure.cancel_token, + "registered cancellation key" + ); + + // wait before continuing. + tokio::time::sleep(CANCEL_KEY_REFRESH).await; + } + // retry immediately. + Ok(Err(error)) => { + tracing::warn!(?error, "error registering cancellation key"); + } + Err(Err(_cancelled)) => break, } } + + if let Err(err) = cancel_closure + .try_cancel_query(compute_config) + .boxed() + .await + { + tracing::warn!( + ?session_id, + ?err, + "could not cancel the query in the database" + ); + } } } diff --git a/proxy/src/redis/keys.rs b/proxy/src/redis/keys.rs index b453e6851c..ffb7bc876b 100644 --- a/proxy/src/redis/keys.rs +++ b/proxy/src/redis/keys.rs @@ -23,9 +23,8 @@ impl KeyPrefix { #[cfg(test)] mod tests { - use crate::pqproto::id_to_cancel_key; - use super::*; + use crate::pqproto::id_to_cancel_key; #[test] fn test_build_redis_key() { From fcf812790060137eef579cf0a864a440ba57732f Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Wed, 2 Jul 2025 10:55:44 +0200 Subject: [PATCH 208/209] [proxy]: Fix redis IRSA expiration failure errors (#12430) Relates to the [#30688](https://github.com/neondatabase/cloud/issues/30688) --- proxy/src/cancellation.rs | 15 +++++++++++++ .../connection_with_credentials_provider.rs | 22 ++++++++++++++----- proxy/src/redis/kv_ops.rs | 6 ++++- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index ffc0cf43f1..74413f1a7d 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -64,6 +64,13 @@ impl Pipeline { let responses = self.replies; let batch_size = self.inner.len(); + if !client.credentials_refreshed() { + tracing::debug!( + "Redis credentials are not refreshed. Sleeping for 5 seconds before retrying..." + ); + tokio::time::sleep(Duration::from_secs(5)).await; + } + match client.query(&self.inner).await { // for each reply, we expect that many values. Ok(Value::Array(values)) if values.len() == responses => { @@ -127,6 +134,14 @@ impl QueueProcessing for CancellationProcessor { } async fn apply(&mut self, batch: Vec) -> Vec { + if !self.client.credentials_refreshed() { + // this will cause a timeout for cancellation operations + tracing::debug!( + "Redis credentials are not refreshed. Sleeping for 5 seconds before retrying..." 
+ ); + tokio::time::sleep(Duration::from_secs(5)).await; + } + let mut pipeline = Pipeline::with_capacity(batch.len()); let batch_size = batch.len(); diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index fe656557ac..510701cb27 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,4 +1,4 @@ -use std::sync::Arc; +use std::sync::{Arc, atomic::AtomicBool, atomic::Ordering}; use std::time::Duration; use futures::FutureExt; @@ -33,6 +33,7 @@ pub struct ConnectionWithCredentialsProvider { con: Option, refresh_token_task: Option>, mutex: tokio::sync::Mutex<()>, + credentials_refreshed: Arc, } impl Clone for ConnectionWithCredentialsProvider { @@ -42,6 +43,7 @@ impl Clone for ConnectionWithCredentialsProvider { con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), + credentials_refreshed: Arc::new(AtomicBool::new(false)), } } } @@ -65,6 +67,7 @@ impl ConnectionWithCredentialsProvider { con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), + credentials_refreshed: Arc::new(AtomicBool::new(false)), } } @@ -78,6 +81,7 @@ impl ConnectionWithCredentialsProvider { con: None, refresh_token_task: None, mutex: tokio::sync::Mutex::new(()), + credentials_refreshed: Arc::new(AtomicBool::new(true)), } } @@ -85,6 +89,10 @@ impl ConnectionWithCredentialsProvider { redis::cmd("PING").query_async(con).await } + pub(crate) fn credentials_refreshed(&self) -> bool { + self.credentials_refreshed.load(Ordering::Relaxed) + } + pub(crate) async fn connect(&mut self) -> anyhow::Result<()> { let _guard = self.mutex.lock().await; if let Some(con) = self.con.as_mut() { @@ -112,11 +120,15 @@ impl ConnectionWithCredentialsProvider { if let Credentials::Dynamic(credentials_provider, _) = &self.credentials { let credentials_provider = credentials_provider.clone(); let con2 = con.clone(); + let credentials_refreshed = self.credentials_refreshed.clone(); let f = tokio::spawn(async move { - Self::keep_connection(con2, credentials_provider) - .await - .inspect_err(|e| debug!("keep_connection failed: {e}")) - .ok(); + let result = Self::keep_connection(con2, credentials_provider).await; + if let Err(e) = result { + credentials_refreshed.store(false, Ordering::Release); + debug!("keep_connection failed: {e}"); + } else { + credentials_refreshed.store(true, Ordering::Release); + } }); self.refresh_token_task = Some(f); } diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs index 671fe09b0b..cfdbc21839 100644 --- a/proxy/src/redis/kv_ops.rs +++ b/proxy/src/redis/kv_ops.rs @@ -40,6 +40,10 @@ impl RedisKVClient { .inspect_err(|e| tracing::error!("failed to connect to redis: {e}")) } + pub(crate) fn credentials_refreshed(&self) -> bool { + self.client.credentials_refreshed() + } + pub(crate) async fn query( &mut self, q: &impl Queryable, @@ -49,7 +53,7 @@ impl RedisKVClient { Err(e) => e, }; - tracing::error!("failed to run query: {e}"); + tracing::debug!("failed to run query: {e}"); match e.retry_method() { redis::RetryMethod::Reconnect => { tracing::info!("Redis client is disconnected. Reconnecting..."); From 1a7c2450f5ab86d6cff4bb1195b060c769fa0f3c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 3 Jul 2025 10:51:35 +0100 Subject: [PATCH 209/209] fix redis credentials check (#12455) ## Problem `keep_connection` does not exit, so it was never setting `credentials_refreshed`. 
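Reduced to its essentials, the failure mode looks like this (a standalone, synchronous sketch with simplified names; the real code is async and lives in `connection_with_credentials_provider.rs`):

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;

// Stand-in for the hourly token refresh loop: it never returns.
fn keep_connection_forever() {
    loop {
        thread::sleep(Duration::from_secs(60 * 60));
        // ...refresh the auth token here...
    }
}

fn main() {
    let credentials_refreshed = Arc::new(AtomicBool::new(false));
    let flag = Arc::clone(&credentials_refreshed);

    thread::spawn(move || {
        keep_connection_forever();
        // Never reached: the loop above does not exit, so the flag stays false.
        flag.store(true, Ordering::Relaxed);
    });

    // Callers that consult the flag always observe `false`.
    assert!(!credentials_refreshed.load(Ordering::Relaxed));
}
```

Because the refresh loop never returns, the store after it is dead code and the flag is never updated.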
## Summary of changes Set `credentials_refreshed` to true when we first establish a connection, and after we re-authenticate the connection. --- .../connection_with_credentials_provider.rs | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index 510701cb27..0465493799 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,3 +1,4 @@ +use std::convert::Infallible; use std::sync::{Arc, atomic::AtomicBool, atomic::Ordering}; use std::time::Duration; @@ -5,7 +6,7 @@ use futures::FutureExt; use redis::aio::{ConnectionLike, MultiplexedConnection}; use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult}; use tokio::task::JoinHandle; -use tracing::{debug, error, info, warn}; +use tracing::{error, info, warn}; use super::elasticache::CredentialsProvider; @@ -31,7 +32,7 @@ pub struct ConnectionWithCredentialsProvider { credentials: Credentials, // TODO: with more load on the connection, we should consider using a connection pool con: Option, - refresh_token_task: Option>, + refresh_token_task: Option>, mutex: tokio::sync::Mutex<()>, credentials_refreshed: Arc, } @@ -121,15 +122,11 @@ impl ConnectionWithCredentialsProvider { let credentials_provider = credentials_provider.clone(); let con2 = con.clone(); let credentials_refreshed = self.credentials_refreshed.clone(); - let f = tokio::spawn(async move { - let result = Self::keep_connection(con2, credentials_provider).await; - if let Err(e) = result { - credentials_refreshed.store(false, Ordering::Release); - debug!("keep_connection failed: {e}"); - } else { - credentials_refreshed.store(true, Ordering::Release); - } - }); + let f = tokio::spawn(Self::keep_connection( + con2, + credentials_provider, + credentials_refreshed, + )); self.refresh_token_task = Some(f); } match Self::ping(&mut con).await { @@ -165,6 +162,7 @@ impl ConnectionWithCredentialsProvider { async fn get_client(&self) -> anyhow::Result { let client = redis::Client::open(self.get_connection_info().await?)?; + self.credentials_refreshed.store(true, Ordering::Relaxed); Ok(client) } @@ -180,16 +178,19 @@ impl ConnectionWithCredentialsProvider { async fn keep_connection( mut con: MultiplexedConnection, credentials_provider: Arc, - ) -> anyhow::Result<()> { + credentials_refreshed: Arc, + ) -> Infallible { loop { // The connection lives for 12h, for the sanity check we refresh it every hour. tokio::time::sleep(Duration::from_secs(60 * 60)).await; match Self::refresh_token(&mut con, credentials_provider.clone()).await { Ok(()) => { info!("Token refreshed"); + credentials_refreshed.store(true, Ordering::Relaxed); } Err(e) => { error!("Error during token refresh: {e:?}"); + credentials_refreshed.store(false, Ordering::Relaxed); } } }
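For reference, the corrected shape in isolation (again a standalone, synchronous sketch with made-up helper names; the real implementation is async on tokio and refreshes the token through the credentials provider):

```rust
use std::convert::Infallible;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;

// Stand-in for the real refresh call; pretend it always succeeds.
fn try_refresh_token() -> bool {
    true
}

// The loop owns the flag and updates it after every refresh attempt;
// the `Infallible` return type documents that it never exits normally.
fn keep_connection(credentials_refreshed: Arc<AtomicBool>) -> Infallible {
    loop {
        thread::sleep(Duration::from_secs(60 * 60));
        let refreshed = try_refresh_token();
        credentials_refreshed.store(refreshed, Ordering::Relaxed);
    }
}

fn main() {
    let credentials_refreshed = Arc::new(AtomicBool::new(false));

    // Mark credentials as fresh as soon as the connection is first established...
    credentials_refreshed.store(true, Ordering::Relaxed);

    // ...and let the background loop keep the flag up to date from then on.
    let flag = Arc::clone(&credentials_refreshed);
    thread::spawn(move || keep_connection(flag));

    assert!(credentials_refreshed.load(Ordering::Relaxed));
}
```

Returning `Infallible` makes it explicit that nothing after the loop will ever run, so the flag must be updated inside the loop and at connection setup rather than after the call.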